/dports/misc/ncnn/ncnn-20211208/src/layer/riscv/ |
H A D | convolution_7x7_pack1ton.h | 84 _sum3 = vfmacc_vf_f32m1(_sum3, r0[6], _k00, vl); in conv7x7s2_pack1ton_rvv() 92 _sum3 = vfmacc_vf_f32m1(_sum3, r0[7], _k01, vl); in conv7x7s2_pack1ton_rvv() 100 _sum3 = vfmacc_vf_f32m1(_sum3, r0[8], _k02, vl); in conv7x7s2_pack1ton_rvv() 108 _sum3 = vfmacc_vf_f32m1(_sum3, r0[9], _k03, vl); in conv7x7s2_pack1ton_rvv() 116 _sum3 = vfmacc_vf_f32m1(_sum3, r0[10], _k04, vl); in conv7x7s2_pack1ton_rvv() 124 _sum3 = vfmacc_vf_f32m1(_sum3, r0[11], _k05, vl); in conv7x7s2_pack1ton_rvv() 132 _sum3 = vfmacc_vf_f32m1(_sum3, r0[12], _k06, vl); in conv7x7s2_pack1ton_rvv() 151 _sum3 = vfmacc_vf_f32m1(_sum3, r1[6], _k10, vl); in conv7x7s2_pack1ton_rvv() 159 _sum3 = vfmacc_vf_f32m1(_sum3, r1[7], _k11, vl); in conv7x7s2_pack1ton_rvv() 167 _sum3 = vfmacc_vf_f32m1(_sum3, r1[8], _k12, vl); in conv7x7s2_pack1ton_rvv() [all …]
|
H A D | convolution_7x7_pack1ton_fp16s.h | 84 _sum3 = vfmacc_vf_f16m1(_sum3, r0[6], _k00, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 92 _sum3 = vfmacc_vf_f16m1(_sum3, r0[7], _k01, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 100 _sum3 = vfmacc_vf_f16m1(_sum3, r0[8], _k02, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 108 _sum3 = vfmacc_vf_f16m1(_sum3, r0[9], _k03, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 116 _sum3 = vfmacc_vf_f16m1(_sum3, r0[10], _k04, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 124 _sum3 = vfmacc_vf_f16m1(_sum3, r0[11], _k05, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 132 _sum3 = vfmacc_vf_f16m1(_sum3, r0[12], _k06, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 151 _sum3 = vfmacc_vf_f16m1(_sum3, r1[6], _k10, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 159 _sum3 = vfmacc_vf_f16m1(_sum3, r1[7], _k11, vl); in conv7x7s2_pack1ton_fp16sa_rvv() 167 _sum3 = vfmacc_vf_f16m1(_sum3, r1[8], _k12, vl); in conv7x7s2_pack1ton_fp16sa_rvv() [all …]
|
H A D | convolution_3x3_pack1ton.h | 76 _sum3 = vfmacc_vf_f32m1(_sum3, r0[3], _k00, vl); in conv3x3s1_pack1ton_rvv() 84 _sum3 = vfmacc_vf_f32m1(_sum3, r0[4], _k01, vl); in conv3x3s1_pack1ton_rvv() 92 _sum3 = vfmacc_vf_f32m1(_sum3, r0[5], _k02, vl); in conv3x3s1_pack1ton_rvv() 101 _sum3 = vfmacc_vf_f32m1(_sum3, r1[3], _k10, vl); in conv3x3s1_pack1ton_rvv() 109 _sum3 = vfmacc_vf_f32m1(_sum3, r1[4], _k11, vl); in conv3x3s1_pack1ton_rvv() 117 _sum3 = vfmacc_vf_f32m1(_sum3, r1[5], _k12, vl); in conv3x3s1_pack1ton_rvv() 126 _sum3 = vfmacc_vf_f32m1(_sum3, r2[3], _k20, vl); in conv3x3s1_pack1ton_rvv() 134 _sum3 = vfmacc_vf_f32m1(_sum3, r2[4], _k21, vl); in conv3x3s1_pack1ton_rvv() 142 _sum3 = vfmacc_vf_f32m1(_sum3, r2[5], _k22, vl); in conv3x3s1_pack1ton_rvv() 173 _sum3 = vfmacc_vf_f32m1(_sum3, r0[3], _k00, vl); in conv3x3s1_pack1ton_rvv() [all …]
|
H A D | convolution_3x3_pack1ton_fp16s.h | 76 _sum3 = vfmacc_vf_f16m1(_sum3, r0[3], _k00, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 84 _sum3 = vfmacc_vf_f16m1(_sum3, r0[4], _k01, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 92 _sum3 = vfmacc_vf_f16m1(_sum3, r0[5], _k02, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 101 _sum3 = vfmacc_vf_f16m1(_sum3, r1[3], _k10, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 109 _sum3 = vfmacc_vf_f16m1(_sum3, r1[4], _k11, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 117 _sum3 = vfmacc_vf_f16m1(_sum3, r1[5], _k12, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 126 _sum3 = vfmacc_vf_f16m1(_sum3, r2[3], _k20, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 134 _sum3 = vfmacc_vf_f16m1(_sum3, r2[4], _k21, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 142 _sum3 = vfmacc_vf_f16m1(_sum3, r2[5], _k22, vl); in conv3x3s1_pack1ton_fp16sa_rvv() 173 _sum3 = vfmacc_vf_f16m1(_sum3, r0[3], _k00, vl); in conv3x3s1_pack1ton_fp16sa_rvv() [all …]
|
/dports/misc/ncnn/ncnn-20211208/src/layer/mips/ |
H A D | convolution_7x7_pack1to4.h | 95 _sum3 = __msa_fmadd_w(_sum3, _r06, _k00); in conv7x7s2_pack1to4_msa() 99 _sum3 = __msa_fmadd_w(_sum3, _r07, _k01); in conv7x7s2_pack1to4_msa() 103 _sum3 = __msa_fmadd_w(_sum3, _r08, _k02); in conv7x7s2_pack1to4_msa() 107 _sum3 = __msa_fmadd_w(_sum3, _r09, _k03); in conv7x7s2_pack1to4_msa() 111 _sum3 = __msa_fmadd_w(_sum3, _r0a, _k04); in conv7x7s2_pack1to4_msa() 115 _sum3 = __msa_fmadd_w(_sum3, _r0b, _k05); in conv7x7s2_pack1to4_msa() 119 _sum3 = __msa_fmadd_w(_sum3, _r0c, _k06); in conv7x7s2_pack1to4_msa() 152 _sum3 = __msa_fmadd_w(_sum3, _r16, _k10); in conv7x7s2_pack1to4_msa() 156 _sum3 = __msa_fmadd_w(_sum3, _r17, _k11); in conv7x7s2_pack1to4_msa() 160 _sum3 = __msa_fmadd_w(_sum3, _r18, _k12); in conv7x7s2_pack1to4_msa() [all …]
|
H A D | convolution_3x3_pack1to4.h | 88 _sum3 = __msa_fmadd_w(_sum3, _r03, _k00); in conv3x3s1_pack1to4_msa() 96 _sum3 = __msa_fmadd_w(_sum3, _r04, _k01); in conv3x3s1_pack1to4_msa() 104 _sum3 = __msa_fmadd_w(_sum3, _r05, _k02); in conv3x3s1_pack1to4_msa() 128 _sum3 = __msa_fmadd_w(_sum3, _r13, _k10); in conv3x3s1_pack1to4_msa() 136 _sum3 = __msa_fmadd_w(_sum3, _r14, _k11); in conv3x3s1_pack1to4_msa() 144 _sum3 = __msa_fmadd_w(_sum3, _r15, _k12); in conv3x3s1_pack1to4_msa() 168 _sum3 = __msa_fmadd_w(_sum3, _r23, _k20); in conv3x3s1_pack1to4_msa() 176 _sum3 = __msa_fmadd_w(_sum3, _r24, _k21); in conv3x3s1_pack1to4_msa() 184 _sum3 = __msa_fmadd_w(_sum3, _r25, _k22); in conv3x3s1_pack1to4_msa() 225 _sum3 = __msa_fmadd_w(_sum3, _r03, _k00); in conv3x3s1_pack1to4_msa() [all …]
|
/dports/graphics/vapoursynth-waifu2x-ncnn-vulkan/vapoursynth-waifu2x-ncnn-vulkan-r4/deps/ncnn/src/layer/x86/ |
H A D | convolutiondepthwise_3x3_pack8.h | 118 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_pack8_avx() 119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx() 120 _sum3 = _mm256_fmadd_ps(_k02, _r05, _sum3); in convdw3x3s1_pack8_avx() 121 _sum3 = _mm256_fmadd_ps(_k10, _r13, _sum3); in convdw3x3s1_pack8_avx() 122 _sum3 = _mm256_fmadd_ps(_k11, _r14, _sum3); in convdw3x3s1_pack8_avx() 123 _sum3 = _mm256_fmadd_ps(_k12, _r15, _sum3); in convdw3x3s1_pack8_avx() 124 _sum3 = _mm256_fmadd_ps(_k20, _r23, _sum3); in convdw3x3s1_pack8_avx() 125 _sum3 = _mm256_fmadd_ps(_k21, _r24, _sum3); in convdw3x3s1_pack8_avx() 126 _sum3 = _mm256_fmadd_ps(_k22, _r25, _sum3); in convdw3x3s1_pack8_avx() 260 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_pack8_avx() [all …]
|
H A D | convolutiondepthwise_3x3_pack8_fp16.h | 118 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_fp16_pack8_avx() 119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx() 120 _sum3 = _mm256_fmadd_ps(_k02, _r05, _sum3); in convdw3x3s1_fp16_pack8_avx() 121 _sum3 = _mm256_fmadd_ps(_k10, _r13, _sum3); in convdw3x3s1_fp16_pack8_avx() 122 _sum3 = _mm256_fmadd_ps(_k11, _r14, _sum3); in convdw3x3s1_fp16_pack8_avx() 123 _sum3 = _mm256_fmadd_ps(_k12, _r15, _sum3); in convdw3x3s1_fp16_pack8_avx() 124 _sum3 = _mm256_fmadd_ps(_k20, _r23, _sum3); in convdw3x3s1_fp16_pack8_avx() 125 _sum3 = _mm256_fmadd_ps(_k21, _r24, _sum3); in convdw3x3s1_fp16_pack8_avx() 126 _sum3 = _mm256_fmadd_ps(_k22, _r25, _sum3); in convdw3x3s1_fp16_pack8_avx() 260 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_fp16_pack8_avx() [all …]
|
H A D | innerproduct_x86.cpp | 210 _sum3 = _mm256_fmadd_ps(_val, _k3, _sum3); in forward() 931 _sum3 = _mm256_fmadd_ps(_val3, _w3, _sum3); in forward() 958 _sum3 = _mm256_fmadd_ps(_val3, _w3, _sum3); in forward() 1035 _sum3 = _mm_fmadd_ps(_val3, _w3, _sum3); in forward() 1063 _sum3 = _mm_add_ps(_mm_mul_ps(_val3, _w3), _sum3); in forward() 1155 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward() 1251 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward() 1468 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward_fp16() 1533 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward_fp16() 1599 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward_fp16() [all …]
|
/dports/graphics/waifu2x-ncnn-vulkan/waifu2x-ncnn-vulkan-20210521/src/ncnn/src/layer/x86/ |
H A D | convolutiondepthwise_3x3_pack8_fp16.h | 118 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_fp16_pack8_avx() 119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx() 120 _sum3 = _mm256_fmadd_ps(_k02, _r05, _sum3); in convdw3x3s1_fp16_pack8_avx() 121 _sum3 = _mm256_fmadd_ps(_k10, _r13, _sum3); in convdw3x3s1_fp16_pack8_avx() 122 _sum3 = _mm256_fmadd_ps(_k11, _r14, _sum3); in convdw3x3s1_fp16_pack8_avx() 123 _sum3 = _mm256_fmadd_ps(_k12, _r15, _sum3); in convdw3x3s1_fp16_pack8_avx() 124 _sum3 = _mm256_fmadd_ps(_k20, _r23, _sum3); in convdw3x3s1_fp16_pack8_avx() 125 _sum3 = _mm256_fmadd_ps(_k21, _r24, _sum3); in convdw3x3s1_fp16_pack8_avx() 126 _sum3 = _mm256_fmadd_ps(_k22, _r25, _sum3); in convdw3x3s1_fp16_pack8_avx() 260 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_fp16_pack8_avx() [all …]
|
H A D | convolutiondepthwise_3x3_pack8.h | 118 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_pack8_avx() 119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx() 120 _sum3 = _mm256_fmadd_ps(_k02, _r05, _sum3); in convdw3x3s1_pack8_avx() 121 _sum3 = _mm256_fmadd_ps(_k10, _r13, _sum3); in convdw3x3s1_pack8_avx() 122 _sum3 = _mm256_fmadd_ps(_k11, _r14, _sum3); in convdw3x3s1_pack8_avx() 123 _sum3 = _mm256_fmadd_ps(_k12, _r15, _sum3); in convdw3x3s1_pack8_avx() 124 _sum3 = _mm256_fmadd_ps(_k20, _r23, _sum3); in convdw3x3s1_pack8_avx() 125 _sum3 = _mm256_fmadd_ps(_k21, _r24, _sum3); in convdw3x3s1_pack8_avx() 126 _sum3 = _mm256_fmadd_ps(_k22, _r25, _sum3); in convdw3x3s1_pack8_avx() 260 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_pack8_avx() [all …]
|
H A D | innerproduct_x86.cpp | 210 _sum3 = _mm256_fmadd_ps(_val, _k3, _sum3); in forward() 931 _sum3 = _mm256_fmadd_ps(_val3, _w3, _sum3); in forward() 958 _sum3 = _mm256_fmadd_ps(_val3, _w3, _sum3); in forward() 1035 _sum3 = _mm_fmadd_ps(_val3, _w3, _sum3); in forward() 1063 _sum3 = _mm_add_ps(_mm_mul_ps(_val3, _w3), _sum3); in forward() 1155 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward() 1251 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward() 1468 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward_fp16() 1533 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward_fp16() 1599 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward_fp16() [all …]
|
/dports/misc/ncnn/ncnn-20211208/src/layer/x86/ |
H A D | convolutiondepthwise_3x3_pack4.h | 118 _sum3 = _mm_comp_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_pack4_sse() 119 _sum3 = _mm_comp_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack4_sse() 120 _sum3 = _mm_comp_fmadd_ps(_k02, _r05, _sum3); in convdw3x3s1_pack4_sse() 121 _sum3 = _mm_comp_fmadd_ps(_k10, _r13, _sum3); in convdw3x3s1_pack4_sse() 122 _sum3 = _mm_comp_fmadd_ps(_k11, _r14, _sum3); in convdw3x3s1_pack4_sse() 123 _sum3 = _mm_comp_fmadd_ps(_k12, _r15, _sum3); in convdw3x3s1_pack4_sse() 124 _sum3 = _mm_comp_fmadd_ps(_k20, _r23, _sum3); in convdw3x3s1_pack4_sse() 125 _sum3 = _mm_comp_fmadd_ps(_k21, _r24, _sum3); in convdw3x3s1_pack4_sse() 126 _sum3 = _mm_comp_fmadd_ps(_k22, _r25, _sum3); in convdw3x3s1_pack4_sse() 260 _sum3 = _mm_comp_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_pack4_sse() [all …]
|
H A D | convolutiondepthwise_3x3_pack8_fp16.h | 118 _sum3 = _mm256_comp_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_fp16_pack8_avx() 119 _sum3 = _mm256_comp_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx() 120 _sum3 = _mm256_comp_fmadd_ps(_k02, _r05, _sum3); in convdw3x3s1_fp16_pack8_avx() 121 _sum3 = _mm256_comp_fmadd_ps(_k10, _r13, _sum3); in convdw3x3s1_fp16_pack8_avx() 122 _sum3 = _mm256_comp_fmadd_ps(_k11, _r14, _sum3); in convdw3x3s1_fp16_pack8_avx() 123 _sum3 = _mm256_comp_fmadd_ps(_k12, _r15, _sum3); in convdw3x3s1_fp16_pack8_avx() 124 _sum3 = _mm256_comp_fmadd_ps(_k20, _r23, _sum3); in convdw3x3s1_fp16_pack8_avx() 125 _sum3 = _mm256_comp_fmadd_ps(_k21, _r24, _sum3); in convdw3x3s1_fp16_pack8_avx() 126 _sum3 = _mm256_comp_fmadd_ps(_k22, _r25, _sum3); in convdw3x3s1_fp16_pack8_avx() 260 _sum3 = _mm256_comp_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_fp16_pack8_avx() [all …]
|
H A D | convolutiondepthwise_3x3_pack8.h | 118 _sum3 = _mm256_comp_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_pack8_avx() 119 _sum3 = _mm256_comp_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx() 120 _sum3 = _mm256_comp_fmadd_ps(_k02, _r05, _sum3); in convdw3x3s1_pack8_avx() 121 _sum3 = _mm256_comp_fmadd_ps(_k10, _r13, _sum3); in convdw3x3s1_pack8_avx() 122 _sum3 = _mm256_comp_fmadd_ps(_k11, _r14, _sum3); in convdw3x3s1_pack8_avx() 123 _sum3 = _mm256_comp_fmadd_ps(_k12, _r15, _sum3); in convdw3x3s1_pack8_avx() 124 _sum3 = _mm256_comp_fmadd_ps(_k20, _r23, _sum3); in convdw3x3s1_pack8_avx() 125 _sum3 = _mm256_comp_fmadd_ps(_k21, _r24, _sum3); in convdw3x3s1_pack8_avx() 126 _sum3 = _mm256_comp_fmadd_ps(_k22, _r25, _sum3); in convdw3x3s1_pack8_avx() 260 _sum3 = _mm256_comp_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_pack8_avx() [all …]
|
H A D | innerproduct_x86.cpp | 210 _sum3 = _mm256_comp_fmadd_ps(_val, _k3, _sum3); in forward() 931 _sum3 = _mm256_comp_fmadd_ps(_val3, _w3, _sum3); in forward() 958 _sum3 = _mm256_comp_fmadd_ps(_val3, _w3, _sum3); in forward() 1035 _sum3 = _mm_comp_fmadd_ps(_val3, _w3, _sum3); in forward() 1063 _sum3 = _mm_add_ps(_mm_mul_ps(_val3, _w3), _sum3); in forward() 1155 _sum3 = _mm256_comp_fmadd_ps(_m, _w3, _sum3); in forward() 1251 _sum3 = _mm256_comp_fmadd_ps(_m, _w3, _sum3); in forward() 1468 _sum3 = _mm256_comp_fmadd_ps(_m, _w3, _sum3); in forward_fp16() 1533 _sum3 = _mm256_comp_fmadd_ps(_m, _w3, _sum3); in forward_fp16() 1599 _sum3 = _mm256_comp_fmadd_ps(_m, _w3, _sum3); in forward_fp16() [all …]
|
/dports/benchmarks/vkpeak/vkpeak-20210430/ncnn/src/layer/x86/ |
H A D | convolutiondepthwise_3x3_pack8.h | 118 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_pack8_avx() 119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx() 120 _sum3 = _mm256_fmadd_ps(_k02, _r05, _sum3); in convdw3x3s1_pack8_avx() 121 _sum3 = _mm256_fmadd_ps(_k10, _r13, _sum3); in convdw3x3s1_pack8_avx() 122 _sum3 = _mm256_fmadd_ps(_k11, _r14, _sum3); in convdw3x3s1_pack8_avx() 123 _sum3 = _mm256_fmadd_ps(_k12, _r15, _sum3); in convdw3x3s1_pack8_avx() 124 _sum3 = _mm256_fmadd_ps(_k20, _r23, _sum3); in convdw3x3s1_pack8_avx() 125 _sum3 = _mm256_fmadd_ps(_k21, _r24, _sum3); in convdw3x3s1_pack8_avx() 126 _sum3 = _mm256_fmadd_ps(_k22, _r25, _sum3); in convdw3x3s1_pack8_avx() 260 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_pack8_avx() [all …]
|
H A D | convolutiondepthwise_3x3_pack8_fp16.h | 118 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_fp16_pack8_avx() 119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx() 120 _sum3 = _mm256_fmadd_ps(_k02, _r05, _sum3); in convdw3x3s1_fp16_pack8_avx() 121 _sum3 = _mm256_fmadd_ps(_k10, _r13, _sum3); in convdw3x3s1_fp16_pack8_avx() 122 _sum3 = _mm256_fmadd_ps(_k11, _r14, _sum3); in convdw3x3s1_fp16_pack8_avx() 123 _sum3 = _mm256_fmadd_ps(_k12, _r15, _sum3); in convdw3x3s1_fp16_pack8_avx() 124 _sum3 = _mm256_fmadd_ps(_k20, _r23, _sum3); in convdw3x3s1_fp16_pack8_avx() 125 _sum3 = _mm256_fmadd_ps(_k21, _r24, _sum3); in convdw3x3s1_fp16_pack8_avx() 126 _sum3 = _mm256_fmadd_ps(_k22, _r25, _sum3); in convdw3x3s1_fp16_pack8_avx() 260 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_fp16_pack8_avx() [all …]
|
H A D | innerproduct_x86.cpp | 210 _sum3 = _mm256_fmadd_ps(_val, _k3, _sum3); in forward() 931 _sum3 = _mm256_fmadd_ps(_val3, _w3, _sum3); in forward() 958 _sum3 = _mm256_fmadd_ps(_val3, _w3, _sum3); in forward() 1035 _sum3 = _mm_fmadd_ps(_val3, _w3, _sum3); in forward() 1063 _sum3 = _mm_add_ps(_mm_mul_ps(_val3, _w3), _sum3); in forward() 1155 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward() 1251 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward() 1468 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward_fp16() 1533 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward_fp16() 1599 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward_fp16() [all …]
|
/dports/graphics/realsr-ncnn-vulkan/realsr-ncnn-vulkan-20210210/src/ncnn/src/layer/x86/ |
H A D | convolutiondepthwise_3x3_pack8_fp16.h | 118 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_fp16_pack8_avx() 119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx() 120 _sum3 = _mm256_fmadd_ps(_k02, _r05, _sum3); in convdw3x3s1_fp16_pack8_avx() 121 _sum3 = _mm256_fmadd_ps(_k10, _r13, _sum3); in convdw3x3s1_fp16_pack8_avx() 122 _sum3 = _mm256_fmadd_ps(_k11, _r14, _sum3); in convdw3x3s1_fp16_pack8_avx() 123 _sum3 = _mm256_fmadd_ps(_k12, _r15, _sum3); in convdw3x3s1_fp16_pack8_avx() 124 _sum3 = _mm256_fmadd_ps(_k20, _r23, _sum3); in convdw3x3s1_fp16_pack8_avx() 125 _sum3 = _mm256_fmadd_ps(_k21, _r24, _sum3); in convdw3x3s1_fp16_pack8_avx() 126 _sum3 = _mm256_fmadd_ps(_k22, _r25, _sum3); in convdw3x3s1_fp16_pack8_avx() 260 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_fp16_pack8_avx() [all …]
|
H A D | convolutiondepthwise_3x3_pack8.h | 118 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_pack8_avx() 119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx() 120 _sum3 = _mm256_fmadd_ps(_k02, _r05, _sum3); in convdw3x3s1_pack8_avx() 121 _sum3 = _mm256_fmadd_ps(_k10, _r13, _sum3); in convdw3x3s1_pack8_avx() 122 _sum3 = _mm256_fmadd_ps(_k11, _r14, _sum3); in convdw3x3s1_pack8_avx() 123 _sum3 = _mm256_fmadd_ps(_k12, _r15, _sum3); in convdw3x3s1_pack8_avx() 124 _sum3 = _mm256_fmadd_ps(_k20, _r23, _sum3); in convdw3x3s1_pack8_avx() 125 _sum3 = _mm256_fmadd_ps(_k21, _r24, _sum3); in convdw3x3s1_pack8_avx() 126 _sum3 = _mm256_fmadd_ps(_k22, _r25, _sum3); in convdw3x3s1_pack8_avx() 260 _sum3 = _mm256_fmadd_ps(_k00, _r03, _sum3); in convdw3x3s1_pack8_avx() [all …]
|
H A D | innerproduct_x86.cpp | 203 _sum3 = _mm256_fmadd_ps(_val, _k3, _sum3); in forward() 947 _sum3 = _mm256_fmadd_ps(_val3, _w3, _sum3); in forward() 974 _sum3 = _mm256_fmadd_ps(_val3, _w3, _sum3); in forward() 1051 _sum3 = _mm_fmadd_ps(_val3, _w3, _sum3); in forward() 1079 _sum3 = _mm_add_ps(_mm_mul_ps(_val3, _w3), _sum3); in forward() 1171 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward() 1267 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward() 1509 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward_fp16() 1574 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward_fp16() 1640 _sum3 = _mm256_fmadd_ps(_m, _w3, _sum3); in forward_fp16() [all …]
|
/dports/graphics/vapoursynth-waifu2x-ncnn-vulkan/vapoursynth-waifu2x-ncnn-vulkan-r4/deps/ncnn/src/layer/arm/ |
H A D | convolution_3x3_pack8to4_int8.h | 933 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w0), vget_low_s16(_val0), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() 952 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w1), vget_low_s16(_val1), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() 971 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w2), vget_low_s16(_val2), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() 990 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w3), vget_low_s16(_val3), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() 1009 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w4), vget_low_s16(_val4), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() 1028 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w5), vget_low_s16(_val5), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() 1047 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w6), vget_low_s16(_val6), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() 1066 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w7), vget_low_s16(_val7), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() 1137 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w0), vget_low_s16(_val1), 0); in conv3x3s1_winograd42_pack8to4_int8_neon() 1148 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w1), vget_low_s16(_val1), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() [all …]
|
/dports/graphics/waifu2x-ncnn-vulkan/waifu2x-ncnn-vulkan-20210521/src/ncnn/src/layer/arm/ |
H A D | convolution_3x3_pack8to4_int8.h | 933 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w0), vget_low_s16(_val0), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() 952 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w1), vget_low_s16(_val1), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() 971 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w2), vget_low_s16(_val2), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() 990 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w3), vget_low_s16(_val3), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() 1009 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w4), vget_low_s16(_val4), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() 1028 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w5), vget_low_s16(_val5), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() 1047 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w6), vget_low_s16(_val6), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() 1066 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w7), vget_low_s16(_val7), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() 1137 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w0), vget_low_s16(_val1), 0); in conv3x3s1_winograd42_pack8to4_int8_neon() 1148 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w1), vget_low_s16(_val1), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() [all …]
|
/dports/benchmarks/vkpeak/vkpeak-20210430/ncnn/src/layer/arm/ |
H A D | convolution_3x3_pack8to4_int8.h | 933 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w0), vget_low_s16(_val0), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() 952 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w1), vget_low_s16(_val1), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() 971 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w2), vget_low_s16(_val2), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() 990 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w3), vget_low_s16(_val3), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() 1009 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w4), vget_low_s16(_val4), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() 1028 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w5), vget_low_s16(_val5), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() 1047 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w6), vget_low_s16(_val6), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() 1066 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w7), vget_low_s16(_val7), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() 1137 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w0), vget_low_s16(_val1), 0); in conv3x3s1_winograd42_pack8to4_int8_neon() 1148 _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_w1), vget_low_s16(_val1), 1); in conv3x3s1_winograd42_pack8to4_int8_neon() [all …]
|