Home
last modified time | relevance | path

Searched refs:_sum3 (Results 1 – 25 of 137) sorted by relevance

123456

/dports/misc/ncnn/ncnn-20211208/src/layer/riscv/
H A Dconvolution_7x7_pack1ton.h84 _sum3 = vfmacc_vf_f32m1(_sum3, r0[6], _k00, vl); in conv7x7s2_pack1ton_rvv()
92 _sum3 = vfmacc_vf_f32m1(_sum3, r0[7], _k01, vl); in conv7x7s2_pack1ton_rvv()
100 _sum3 = vfmacc_vf_f32m1(_sum3, r0[8], _k02, vl); in conv7x7s2_pack1ton_rvv()
108 _sum3 = vfmacc_vf_f32m1(_sum3, r0[9], _k03, vl); in conv7x7s2_pack1ton_rvv()
116 _sum3 = vfmacc_vf_f32m1(_sum3, r0[10], _k04, vl); in conv7x7s2_pack1ton_rvv()
124 _sum3 = vfmacc_vf_f32m1(_sum3, r0[11], _k05, vl); in conv7x7s2_pack1ton_rvv()
132 _sum3 = vfmacc_vf_f32m1(_sum3, r0[12], _k06, vl); in conv7x7s2_pack1ton_rvv()
151 _sum3 = vfmacc_vf_f32m1(_sum3, r1[6], _k10, vl); in conv7x7s2_pack1ton_rvv()
159 _sum3 = vfmacc_vf_f32m1(_sum3, r1[7], _k11, vl); in conv7x7s2_pack1ton_rvv()
167 _sum3 = vfmacc_vf_f32m1(_sum3, r1[8], _k12, vl); in conv7x7s2_pack1ton_rvv()
[all …]
H A Dconvolution_7x7_pack1ton_fp16s.h84 _sum3 = vfmacc_vf_f16m1(_sum3, r0[6], _k00, vl); in conv7x7s2_pack1ton_fp16sa_rvv()
92 _sum3 = vfmacc_vf_f16m1(_sum3, r0[7], _k01, vl); in conv7x7s2_pack1ton_fp16sa_rvv()
100 _sum3 = vfmacc_vf_f16m1(_sum3, r0[8], _k02, vl); in conv7x7s2_pack1ton_fp16sa_rvv()
108 _sum3 = vfmacc_vf_f16m1(_sum3, r0[9], _k03, vl); in conv7x7s2_pack1ton_fp16sa_rvv()
116 _sum3 = vfmacc_vf_f16m1(_sum3, r0[10], _k04, vl); in conv7x7s2_pack1ton_fp16sa_rvv()
124 _sum3 = vfmacc_vf_f16m1(_sum3, r0[11], _k05, vl); in conv7x7s2_pack1ton_fp16sa_rvv()
132 _sum3 = vfmacc_vf_f16m1(_sum3, r0[12], _k06, vl); in conv7x7s2_pack1ton_fp16sa_rvv()
151 _sum3 = vfmacc_vf_f16m1(_sum3, r1[6], _k10, vl); in conv7x7s2_pack1ton_fp16sa_rvv()
159 _sum3 = vfmacc_vf_f16m1(_sum3, r1[7], _k11, vl); in conv7x7s2_pack1ton_fp16sa_rvv()
167 _sum3 = vfmacc_vf_f16m1(_sum3, r1[8], _k12, vl); in conv7x7s2_pack1ton_fp16sa_rvv()
[all …]
H A Dconvolution_3x3_pack1ton.h76 _sum3 = vfmacc_vf_f32m1(_sum3, r0[3], _k00, vl); in conv3x3s1_pack1ton_rvv()
84 _sum3 = vfmacc_vf_f32m1(_sum3, r0[4], _k01, vl); in conv3x3s1_pack1ton_rvv()
92 _sum3 = vfmacc_vf_f32m1(_sum3, r0[5], _k02, vl); in conv3x3s1_pack1ton_rvv()
101 _sum3 = vfmacc_vf_f32m1(_sum3, r1[3], _k10, vl); in conv3x3s1_pack1ton_rvv()
109 _sum3 = vfmacc_vf_f32m1(_sum3, r1[4], _k11, vl); in conv3x3s1_pack1ton_rvv()
117 _sum3 = vfmacc_vf_f32m1(_sum3, r1[5], _k12, vl); in conv3x3s1_pack1ton_rvv()
126 _sum3 = vfmacc_vf_f32m1(_sum3, r2[3], _k20, vl); in conv3x3s1_pack1ton_rvv()
134 _sum3 = vfmacc_vf_f32m1(_sum3, r2[4], _k21, vl); in conv3x3s1_pack1ton_rvv()
142 _sum3 = vfmacc_vf_f32m1(_sum3, r2[5], _k22, vl); in conv3x3s1_pack1ton_rvv()
173 _sum3 = vfmacc_vf_f32m1(_sum3, r0[3], _k00, vl); in conv3x3s1_pack1ton_rvv()
[all …]
H A Dconvolution_3x3_pack1ton_fp16s.h76 _sum3 = vfmacc_vf_f16m1(_sum3, r0[3], _k00, vl); in conv3x3s1_pack1ton_fp16sa_rvv()
84 _sum3 = vfmacc_vf_f16m1(_sum3, r0[4], _k01, vl); in conv3x3s1_pack1ton_fp16sa_rvv()
92 _sum3 = vfmacc_vf_f16m1(_sum3, r0[5], _k02, vl); in conv3x3s1_pack1ton_fp16sa_rvv()
101 _sum3 = vfmacc_vf_f16m1(_sum3, r1[3], _k10, vl); in conv3x3s1_pack1ton_fp16sa_rvv()
109 _sum3 = vfmacc_vf_f16m1(_sum3, r1[4], _k11, vl); in conv3x3s1_pack1ton_fp16sa_rvv()
117 _sum3 = vfmacc_vf_f16m1(_sum3, r1[5], _k12, vl); in conv3x3s1_pack1ton_fp16sa_rvv()
126 _sum3 = vfmacc_vf_f16m1(_sum3, r2[3], _k20, vl); in conv3x3s1_pack1ton_fp16sa_rvv()
134 _sum3 = vfmacc_vf_f16m1(_sum3, r2[4], _k21, vl); in conv3x3s1_pack1ton_fp16sa_rvv()
142 _sum3 = vfmacc_vf_f16m1(_sum3, r2[5], _k22, vl); in conv3x3s1_pack1ton_fp16sa_rvv()
173 _sum3 = vfmacc_vf_f16m1(_sum3, r0[3], _k00, vl); in conv3x3s1_pack1ton_fp16sa_rvv()
[all …]
/dports/misc/ncnn/ncnn-20211208/src/layer/mips/
H A Dconvolution_7x7_pack1to4.h95 _sum3 = __msa_fmadd_w(_sum3, _r06, _k00); in conv7x7s2_pack1to4_msa()
99 _sum3 = __msa_fmadd_w(_sum3, _r07, _k01); in conv7x7s2_pack1to4_msa()
103 _sum3 = __msa_fmadd_w(_sum3, _r08, _k02); in conv7x7s2_pack1to4_msa()
107 _sum3 = __msa_fmadd_w(_sum3, _r09, _k03); in conv7x7s2_pack1to4_msa()
111 _sum3 = __msa_fmadd_w(_sum3, _r0a, _k04); in conv7x7s2_pack1to4_msa()
115 _sum3 = __msa_fmadd_w(_sum3, _r0b, _k05); in conv7x7s2_pack1to4_msa()
119 _sum3 = __msa_fmadd_w(_sum3, _r0c, _k06); in conv7x7s2_pack1to4_msa()
152 _sum3 = __msa_fmadd_w(_sum3, _r16, _k10); in conv7x7s2_pack1to4_msa()
156 _sum3 = __msa_fmadd_w(_sum3, _r17, _k11); in conv7x7s2_pack1to4_msa()
160 _sum3 = __msa_fmadd_w(_sum3, _r18, _k12); in conv7x7s2_pack1to4_msa()
[all …]
H A Dconvolution_3x3_pack1to4.h88 _sum3 = __msa_fmadd_w(_sum3, _r03, _k00); in conv3x3s1_pack1to4_msa()
96 _sum3 = __msa_fmadd_w(_sum3, _r04, _k01); in conv3x3s1_pack1to4_msa()
104 _sum3 = __msa_fmadd_w(_sum3, _r05, _k02); in conv3x3s1_pack1to4_msa()
128 _sum3 = __msa_fmadd_w(_sum3, _r13, _k10); in conv3x3s1_pack1to4_msa()
136 _sum3 = __msa_fmadd_w(_sum3, _r14, _k11); in conv3x3s1_pack1to4_msa()
144 _sum3 = __msa_fmadd_w(_sum3, _r15, _k12); in conv3x3s1_pack1to4_msa()
168 _sum3 = __msa_fmadd_w(_sum3, _r23, _k20); in conv3x3s1_pack1to4_msa()
176 _sum3 = __msa_fmadd_w(_sum3, _r24, _k21); in conv3x3s1_pack1to4_msa()
184 _sum3 = __msa_fmadd_w(_sum3, _r25, _k22); in conv3x3s1_pack1to4_msa()
225 _sum3 = __msa_fmadd_w(_sum3, _r03, _k00); in conv3x3s1_pack1to4_msa()
[all …]
/dports/graphics/vapoursynth-waifu2x-ncnn-vulkan/vapoursynth-waifu2x-ncnn-vulkan-r4/deps/ncnn/src/layer/x86/
H A Dconvolutiondepthwise_3x3_pack8.h118 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_pack8_avx()
119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx()
120 _sum3 = _mm256_fmadd_ps(_k02, _r05, _sum3); in convdw3x3s1_pack8_avx()
121 _sum3 = _mm256_fmadd_ps(_k10, _r13, _sum3); in convdw3x3s1_pack8_avx()
122 _sum3 = _mm256_fmadd_ps(_k11, _r14, _sum3); in convdw3x3s1_pack8_avx()
123 _sum3 = _mm256_fmadd_ps(_k12, _r15, _sum3); in convdw3x3s1_pack8_avx()
124 _sum3 = _mm256_fmadd_ps(_k20, _r23, _sum3); in convdw3x3s1_pack8_avx()
125 _sum3 = _mm256_fmadd_ps(_k21, _r24, _sum3); in convdw3x3s1_pack8_avx()
126 _sum3 = _mm256_fmadd_ps(_k22, _r25, _sum3); in convdw3x3s1_pack8_avx()
260 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_pack8_avx()
[all …]
H A Dconvolutiondepthwise_3x3_pack8_fp16.h118 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_fp16_pack8_avx()
119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx()
120 _sum3 = _mm256_fmadd_ps(_k02, _r05, _sum3); in convdw3x3s1_fp16_pack8_avx()
121 _sum3 = _mm256_fmadd_ps(_k10, _r13, _sum3); in convdw3x3s1_fp16_pack8_avx()
122 _sum3 = _mm256_fmadd_ps(_k11, _r14, _sum3); in convdw3x3s1_fp16_pack8_avx()
123 _sum3 = _mm256_fmadd_ps(_k12, _r15, _sum3); in convdw3x3s1_fp16_pack8_avx()
124 _sum3 = _mm256_fmadd_ps(_k20, _r23, _sum3); in convdw3x3s1_fp16_pack8_avx()
125 _sum3 = _mm256_fmadd_ps(_k21, _r24, _sum3); in convdw3x3s1_fp16_pack8_avx()
126 _sum3 = _mm256_fmadd_ps(_k22, _r25, _sum3); in convdw3x3s1_fp16_pack8_avx()
260 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_fp16_pack8_avx()
[all …]
H A Dinnerproduct_x86.cpp210 _sum3 = _mm256_fmadd_ps(_val, _k3, _sum3); in forward()
931 _sum3 = _mm256_fmadd_ps(_val3, _w3, _sum3); in forward()
958 _sum3 = _mm256_fmadd_ps(_val3, _w3, _sum3); in forward()
1035 _sum3 = _mm_fmadd_ps(_val3, _w3, _sum3); in forward()
1063 _sum3 = _mm_add_ps(_mm_mul_ps(_val3, _w3), _sum3); in forward()
1155 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward()
1251 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward()
1468 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward_fp16()
1533 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward_fp16()
1599 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward_fp16()
[all …]
/dports/graphics/waifu2x-ncnn-vulkan/waifu2x-ncnn-vulkan-20210521/src/ncnn/src/layer/x86/
H A Dconvolutiondepthwise_3x3_pack8_fp16.h118 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_fp16_pack8_avx()
119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx()
120 _sum3 = _mm256_fmadd_ps(_k02, _r05, _sum3); in convdw3x3s1_fp16_pack8_avx()
121 _sum3 = _mm256_fmadd_ps(_k10, _r13, _sum3); in convdw3x3s1_fp16_pack8_avx()
122 _sum3 = _mm256_fmadd_ps(_k11, _r14, _sum3); in convdw3x3s1_fp16_pack8_avx()
123 _sum3 = _mm256_fmadd_ps(_k12, _r15, _sum3); in convdw3x3s1_fp16_pack8_avx()
124 _sum3 = _mm256_fmadd_ps(_k20, _r23, _sum3); in convdw3x3s1_fp16_pack8_avx()
125 _sum3 = _mm256_fmadd_ps(_k21, _r24, _sum3); in convdw3x3s1_fp16_pack8_avx()
126 _sum3 = _mm256_fmadd_ps(_k22, _r25, _sum3); in convdw3x3s1_fp16_pack8_avx()
260 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_fp16_pack8_avx()
[all …]
H A Dconvolutiondepthwise_3x3_pack8.h118 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_pack8_avx()
119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx()
120 _sum3 = _mm256_fmadd_ps(_k02, _r05, _sum3); in convdw3x3s1_pack8_avx()
121 _sum3 = _mm256_fmadd_ps(_k10, _r13, _sum3); in convdw3x3s1_pack8_avx()
122 _sum3 = _mm256_fmadd_ps(_k11, _r14, _sum3); in convdw3x3s1_pack8_avx()
123 _sum3 = _mm256_fmadd_ps(_k12, _r15, _sum3); in convdw3x3s1_pack8_avx()
124 _sum3 = _mm256_fmadd_ps(_k20, _r23, _sum3); in convdw3x3s1_pack8_avx()
125 _sum3 = _mm256_fmadd_ps(_k21, _r24, _sum3); in convdw3x3s1_pack8_avx()
126 _sum3 = _mm256_fmadd_ps(_k22, _r25, _sum3); in convdw3x3s1_pack8_avx()
260 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_pack8_avx()
[all …]
H A Dinnerproduct_x86.cpp210 _sum3 = _mm256_fmadd_ps(_val, _k3, _sum3); in forward()
931 _sum3 = _mm256_fmadd_ps(_val3, _w3, _sum3); in forward()
958 _sum3 = _mm256_fmadd_ps(_val3, _w3, _sum3); in forward()
1035 _sum3 = _mm_fmadd_ps(_val3, _w3, _sum3); in forward()
1063 _sum3 = _mm_add_ps(_mm_mul_ps(_val3, _w3), _sum3); in forward()
1155 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward()
1251 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward()
1468 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward_fp16()
1533 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward_fp16()
1599 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward_fp16()
[all …]
/dports/misc/ncnn/ncnn-20211208/src/layer/x86/
H A Dconvolutiondepthwise_3x3_pack4.h118 _sum3 = _mm_comp_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_pack4_sse()
119 _sum3 = _mm_comp_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack4_sse()
120 _sum3 = _mm_comp_fmadd_ps(_k02, _r05, _sum3); in convdw3x3s1_pack4_sse()
121 _sum3 = _mm_comp_fmadd_ps(_k10, _r13, _sum3); in convdw3x3s1_pack4_sse()
122 _sum3 = _mm_comp_fmadd_ps(_k11, _r14, _sum3); in convdw3x3s1_pack4_sse()
123 _sum3 = _mm_comp_fmadd_ps(_k12, _r15, _sum3); in convdw3x3s1_pack4_sse()
124 _sum3 = _mm_comp_fmadd_ps(_k20, _r23, _sum3); in convdw3x3s1_pack4_sse()
125 _sum3 = _mm_comp_fmadd_ps(_k21, _r24, _sum3); in convdw3x3s1_pack4_sse()
126 _sum3 = _mm_comp_fmadd_ps(_k22, _r25, _sum3); in convdw3x3s1_pack4_sse()
260 _sum3 = _mm_comp_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_pack4_sse()
[all …]
H A Dconvolutiondepthwise_3x3_pack8_fp16.h118 _sum3 = _mm256_comp_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_fp16_pack8_avx()
119 _sum3 = _mm256_comp_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx()
120 _sum3 = _mm256_comp_fmadd_ps(_k02, _r05, _sum3); in convdw3x3s1_fp16_pack8_avx()
121 _sum3 = _mm256_comp_fmadd_ps(_k10, _r13, _sum3); in convdw3x3s1_fp16_pack8_avx()
122 _sum3 = _mm256_comp_fmadd_ps(_k11, _r14, _sum3); in convdw3x3s1_fp16_pack8_avx()
123 _sum3 = _mm256_comp_fmadd_ps(_k12, _r15, _sum3); in convdw3x3s1_fp16_pack8_avx()
124 _sum3 = _mm256_comp_fmadd_ps(_k20, _r23, _sum3); in convdw3x3s1_fp16_pack8_avx()
125 _sum3 = _mm256_comp_fmadd_ps(_k21, _r24, _sum3); in convdw3x3s1_fp16_pack8_avx()
126 _sum3 = _mm256_comp_fmadd_ps(_k22, _r25, _sum3); in convdw3x3s1_fp16_pack8_avx()
260 _sum3 = _mm256_comp_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_fp16_pack8_avx()
[all …]
H A Dconvolutiondepthwise_3x3_pack8.h118 _sum3 = _mm256_comp_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_pack8_avx()
119 _sum3 = _mm256_comp_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx()
120 _sum3 = _mm256_comp_fmadd_ps(_k02, _r05, _sum3); in convdw3x3s1_pack8_avx()
121 _sum3 = _mm256_comp_fmadd_ps(_k10, _r13, _sum3); in convdw3x3s1_pack8_avx()
122 _sum3 = _mm256_comp_fmadd_ps(_k11, _r14, _sum3); in convdw3x3s1_pack8_avx()
123 _sum3 = _mm256_comp_fmadd_ps(_k12, _r15, _sum3); in convdw3x3s1_pack8_avx()
124 _sum3 = _mm256_comp_fmadd_ps(_k20, _r23, _sum3); in convdw3x3s1_pack8_avx()
125 _sum3 = _mm256_comp_fmadd_ps(_k21, _r24, _sum3); in convdw3x3s1_pack8_avx()
126 _sum3 = _mm256_comp_fmadd_ps(_k22, _r25, _sum3); in convdw3x3s1_pack8_avx()
260 _sum3 = _mm256_comp_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_pack8_avx()
[all …]
H A Dinnerproduct_x86.cpp210 _sum3 = _mm256_comp_fmadd_ps(_val, _k3, _sum3); in forward()
931 _sum3 = _mm256_comp_fmadd_ps(_val3, _w3, _sum3); in forward()
958 _sum3 = _mm256_comp_fmadd_ps(_val3, _w3, _sum3); in forward()
1035 _sum3 = _mm_comp_fmadd_ps(_val3, _w3, _sum3); in forward()
1063 _sum3 = _mm_add_ps(_mm_mul_ps(_val3, _w3), _sum3); in forward()
1155 _sum3 = _mm256_comp_fmadd_ps(_m, _w3, _sum3); in forward()
1251 _sum3 = _mm256_comp_fmadd_ps(_m, _w3, _sum3); in forward()
1468 _sum3 = _mm256_comp_fmadd_ps(_m, _w3, _sum3); in forward_fp16()
1533 _sum3 = _mm256_comp_fmadd_ps(_m, _w3, _sum3); in forward_fp16()
1599 _sum3 = _mm256_comp_fmadd_ps(_m, _w3, _sum3); in forward_fp16()
[all …]
/dports/benchmarks/vkpeak/vkpeak-20210430/ncnn/src/layer/x86/
H A Dconvolutiondepthwise_3x3_pack8.h118 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_pack8_avx()
119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx()
120 _sum3 = _mm256_fmadd_ps(_k02, _r05, _sum3); in convdw3x3s1_pack8_avx()
121 _sum3 = _mm256_fmadd_ps(_k10, _r13, _sum3); in convdw3x3s1_pack8_avx()
122 _sum3 = _mm256_fmadd_ps(_k11, _r14, _sum3); in convdw3x3s1_pack8_avx()
123 _sum3 = _mm256_fmadd_ps(_k12, _r15, _sum3); in convdw3x3s1_pack8_avx()
124 _sum3 = _mm256_fmadd_ps(_k20, _r23, _sum3); in convdw3x3s1_pack8_avx()
125 _sum3 = _mm256_fmadd_ps(_k21, _r24, _sum3); in convdw3x3s1_pack8_avx()
126 _sum3 = _mm256_fmadd_ps(_k22, _r25, _sum3); in convdw3x3s1_pack8_avx()
260 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_pack8_avx()
[all …]
H A Dconvolutiondepthwise_3x3_pack8_fp16.h118 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_fp16_pack8_avx()
119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx()
120 _sum3 = _mm256_fmadd_ps(_k02, _r05, _sum3); in convdw3x3s1_fp16_pack8_avx()
121 _sum3 = _mm256_fmadd_ps(_k10, _r13, _sum3); in convdw3x3s1_fp16_pack8_avx()
122 _sum3 = _mm256_fmadd_ps(_k11, _r14, _sum3); in convdw3x3s1_fp16_pack8_avx()
123 _sum3 = _mm256_fmadd_ps(_k12, _r15, _sum3); in convdw3x3s1_fp16_pack8_avx()
124 _sum3 = _mm256_fmadd_ps(_k20, _r23, _sum3); in convdw3x3s1_fp16_pack8_avx()
125 _sum3 = _mm256_fmadd_ps(_k21, _r24, _sum3); in convdw3x3s1_fp16_pack8_avx()
126 _sum3 = _mm256_fmadd_ps(_k22, _r25, _sum3); in convdw3x3s1_fp16_pack8_avx()
260 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_fp16_pack8_avx()
[all …]
H A Dinnerproduct_x86.cpp210 _sum3 = _mm256_fmadd_ps(_val, _k3, _sum3); in forward()
931 _sum3 = _mm256_fmadd_ps(_val3, _w3, _sum3); in forward()
958 _sum3 = _mm256_fmadd_ps(_val3, _w3, _sum3); in forward()
1035 _sum3 = _mm_fmadd_ps(_val3, _w3, _sum3); in forward()
1063 _sum3 = _mm_add_ps(_mm_mul_ps(_val3, _w3), _sum3); in forward()
1155 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward()
1251 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward()
1468 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward_fp16()
1533 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward_fp16()
1599 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward_fp16()
[all …]
/dports/graphics/realsr-ncnn-vulkan/realsr-ncnn-vulkan-20210210/src/ncnn/src/layer/x86/
H A Dconvolutiondepthwise_3x3_pack8_fp16.h118 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_fp16_pack8_avx()
119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx()
120 _sum3 = _mm256_fmadd_ps(_k02, _r05, _sum3); in convdw3x3s1_fp16_pack8_avx()
121 _sum3 = _mm256_fmadd_ps(_k10, _r13, _sum3); in convdw3x3s1_fp16_pack8_avx()
122 _sum3 = _mm256_fmadd_ps(_k11, _r14, _sum3); in convdw3x3s1_fp16_pack8_avx()
123 _sum3 = _mm256_fmadd_ps(_k12, _r15, _sum3); in convdw3x3s1_fp16_pack8_avx()
124 _sum3 = _mm256_fmadd_ps(_k20, _r23, _sum3); in convdw3x3s1_fp16_pack8_avx()
125 _sum3 = _mm256_fmadd_ps(_k21, _r24, _sum3); in convdw3x3s1_fp16_pack8_avx()
126 _sum3 = _mm256_fmadd_ps(_k22, _r25, _sum3); in convdw3x3s1_fp16_pack8_avx()
260 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_fp16_pack8_avx()
[all …]
H A Dconvolutiondepthwise_3x3_pack8.h118 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_pack8_avx()
119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx()
120 _sum3 = _mm256_fmadd_ps(_k02, _r05, _sum3); in convdw3x3s1_pack8_avx()
121 _sum3 = _mm256_fmadd_ps(_k10, _r13, _sum3); in convdw3x3s1_pack8_avx()
122 _sum3 = _mm256_fmadd_ps(_k11, _r14, _sum3); in convdw3x3s1_pack8_avx()
123 _sum3 = _mm256_fmadd_ps(_k12, _r15, _sum3); in convdw3x3s1_pack8_avx()
124 _sum3 = _mm256_fmadd_ps(_k20, _r23, _sum3); in convdw3x3s1_pack8_avx()
125 _sum3 = _mm256_fmadd_ps(_k21, _r24, _sum3); in convdw3x3s1_pack8_avx()
126 _sum3 = _mm256_fmadd_ps(_k22, _r25, _sum3); in convdw3x3s1_pack8_avx()
260 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_pack8_avx()
[all …]
H A Dinnerproduct_x86.cpp203 _sum3 = _mm256_fmadd_ps(_val, _k3, _sum3); in forward()
947 _sum3 = _mm256_fmadd_ps(_val3, _w3, _sum3); in forward()
974 _sum3 = _mm256_fmadd_ps(_val3, _w3, _sum3); in forward()
1051 _sum3 = _mm_fmadd_ps(_val3, _w3, _sum3); in forward()
1079 _sum3 = _mm_add_ps(_mm_mul_ps(_val3, _w3), _sum3); in forward()
1171 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward()
1267 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward()
1509 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward_fp16()
1574 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward_fp16()
1640 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward_fp16()
[all …]
/dports/graphics/vapoursynth-waifu2x-ncnn-vulkan/vapoursynth-waifu2x-ncnn-vulkan-r4/deps/ncnn/src/layer/arm/
H A Dconvolution_3x3_pack8to4_int8.h933 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w0), vget_low_s16(_val0), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
952 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w1), vget_low_s16(_val1), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
971 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w2), vget_low_s16(_val2), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
990 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w3), vget_low_s16(_val3), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
1009 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w4), vget_low_s16(_val4), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
1028 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w5), vget_low_s16(_val5), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
1047 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w6), vget_low_s16(_val6), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
1066 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w7), vget_low_s16(_val7), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
1137 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w0), vget_low_s16(_val1), 0); in conv3x3s1_winograd42_pack8to4_int8_neon()
1148 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w1), vget_low_s16(_val1), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
[all …]
/dports/graphics/waifu2x-ncnn-vulkan/waifu2x-ncnn-vulkan-20210521/src/ncnn/src/layer/arm/
H A Dconvolution_3x3_pack8to4_int8.h933 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w0), vget_low_s16(_val0), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
952 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w1), vget_low_s16(_val1), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
971 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w2), vget_low_s16(_val2), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
990 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w3), vget_low_s16(_val3), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
1009 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w4), vget_low_s16(_val4), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
1028 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w5), vget_low_s16(_val5), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
1047 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w6), vget_low_s16(_val6), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
1066 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w7), vget_low_s16(_val7), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
1137 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w0), vget_low_s16(_val1), 0); in conv3x3s1_winograd42_pack8to4_int8_neon()
1148 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w1), vget_low_s16(_val1), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
[all …]
/dports/benchmarks/vkpeak/vkpeak-20210430/ncnn/src/layer/arm/
H A Dconvolution_3x3_pack8to4_int8.h933 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w0), vget_low_s16(_val0), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
952 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w1), vget_low_s16(_val1), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
971 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w2), vget_low_s16(_val2), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
990 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w3), vget_low_s16(_val3), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
1009 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w4), vget_low_s16(_val4), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
1028 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w5), vget_low_s16(_val5), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
1047 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w6), vget_low_s16(_val6), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
1066 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w7), vget_low_s16(_val7), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
1137 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w0), vget_low_s16(_val1), 0); in conv3x3s1_winograd42_pack8to4_int8_neon()
1148 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w1), vget_low_s16(_val1), 1); in conv3x3s1_winograd42_pack8to4_int8_neon()
[all …]

123456