Home
last modified time | relevance | path

Searched refs:_r04 (Results 1 – 25 of 121) sorted by relevance

12345

/dports/graphics/realsr-ncnn-vulkan/realsr-ncnn-vulkan-20210210/src/ncnn/src/layer/x86/
H A Dconvolution_3x3_pack8.h696 _r04 = _mm256_broadcast_ss(r0 + 33); in conv3x3s1_winograd64_pack8_avx()
724 _r04 = _mm256_broadcast_ss(r0 + 34); in conv3x3s1_winograd64_pack8_avx()
752 _r04 = _mm256_broadcast_ss(r0 + 35); in conv3x3s1_winograd64_pack8_avx()
781 _r04 = _mm256_broadcast_ss(r0 + 36); in conv3x3s1_winograd64_pack8_avx()
810 _r04 = _mm256_broadcast_ss(r0 + 37); in conv3x3s1_winograd64_pack8_avx()
839 _r04 = _mm256_broadcast_ss(r0 + 38); in conv3x3s1_winograd64_pack8_avx()
867 _r04 = _mm256_broadcast_ss(r0 + 39); in conv3x3s1_winograd64_pack8_avx()
947 _r04 = _mm256_broadcast_ss(r0 + 33); in conv3x3s1_winograd64_pack8_avx()
966 _r04 = _mm256_broadcast_ss(r0 + 34); in conv3x3s1_winograd64_pack8_avx()
985 _r04 = _mm256_broadcast_ss(r0 + 35); in conv3x3s1_winograd64_pack8_avx()
[all …]
H A Dconvolution_2x2_pack8.h57 __m256 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() local
80 _sum0 = _mm256_fmadd_ps(_k04, _r04, _sum0); in conv2x2s1_pack8_avx()
91 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx()
101 _sum1 = _mm256_fmadd_ps(_k04, _r04, _sum1); in conv2x2s1_pack8_avx()
123 _sum0 = _mm256_fmadd_ps(_k04, _r04, _sum0); in conv2x2s1_pack8_avx()
132 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx()
141 _sum1 = _mm256_fmadd_ps(_k04, _r04, _sum1); in conv2x2s1_pack8_avx()
251 __m256 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() local
273 _sum = _mm256_fmadd_ps(_k04, _r04, _sum); in conv2x2s1_pack8_avx()
284 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx()
[all …]
H A Dpooling_3x3_pack8.h59 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in pooling3x3s2_max_pack8_avx() local
67 __m256 _max10 = _mm256_max_ps(_r03, _r04); in pooling3x3s2_max_pack8_avx()
85 _max20 = _mm256_max_ps(_max20, _r04); in pooling3x3s2_max_pack8_avx()
137 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in pooling3x3s2_max_pack8_avx() local
145 __m256 _max10 = _mm256_max_ps(_r03, _r04); in pooling3x3s2_max_pack8_avx()
H A Dconvolutiondepthwise_3x3_pack8_fp16.h97 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_fp16_pack8_avx() local
104 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_fp16_pack8_avx()
119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx()
134 _sum4 = _mm256_fmadd_ps(_k00, _r04, _sum4); in convdw3x3s1_fp16_pack8_avx()
239 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_fp16_pack8_avx() local
246 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_fp16_pack8_avx()
261 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx()
436 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s2_fp16_pack8_avx() local
443 _sum1 = _mm256_fmadd_ps(_k02, _r04, _sum1); in convdw3x3s2_fp16_pack8_avx()
460 _sum2 = _mm256_fmadd_ps(_k00, _r04, _sum2); in convdw3x3s2_fp16_pack8_avx()
[all …]
H A Dconvolutiondepthwise_3x3_pack8.h97 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_pack8_avx() local
104 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_pack8_avx()
119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx()
134 _sum4 = _mm256_fmadd_ps(_k00, _r04, _sum4); in convdw3x3s1_pack8_avx()
239 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_pack8_avx() local
246 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_pack8_avx()
261 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx()
436 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s2_pack8_avx() local
443 _sum1 = _mm256_fmadd_ps(_k02, _r04, _sum1); in convdw3x3s2_pack8_avx()
460 _sum2 = _mm256_fmadd_ps(_k00, _r04, _sum2); in convdw3x3s2_pack8_avx()
[all …]
/dports/graphics/vapoursynth-waifu2x-ncnn-vulkan/vapoursynth-waifu2x-ncnn-vulkan-r4/deps/ncnn/src/layer/x86/
H A Dconvolution_2x2_pack8.h57 __m256 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() local
80 _sum0 = _mm256_fmadd_ps(_k04, _r04, _sum0); in conv2x2s1_pack8_avx()
91 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx()
101 _sum1 = _mm256_fmadd_ps(_k04, _r04, _sum1); in conv2x2s1_pack8_avx()
123 _sum0 = _mm256_fmadd_ps(_k04, _r04, _sum0); in conv2x2s1_pack8_avx()
132 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx()
141 _sum1 = _mm256_fmadd_ps(_k04, _r04, _sum1); in conv2x2s1_pack8_avx()
251 __m256 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() local
273 _sum = _mm256_fmadd_ps(_k04, _r04, _sum); in conv2x2s1_pack8_avx()
284 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx()
[all …]
H A Dpooling_3x3_pack8.h59 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in pooling3x3s2_max_pack8_avx() local
67 __m256 _max10 = _mm256_max_ps(_r03, _r04); in pooling3x3s2_max_pack8_avx()
85 _max20 = _mm256_max_ps(_max20, _r04); in pooling3x3s2_max_pack8_avx()
137 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in pooling3x3s2_max_pack8_avx() local
145 __m256 _max10 = _mm256_max_ps(_r03, _r04); in pooling3x3s2_max_pack8_avx()
H A Dconvolutiondepthwise_3x3_pack8.h97 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_pack8_avx() local
104 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_pack8_avx()
119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx()
134 _sum4 = _mm256_fmadd_ps(_k00, _r04, _sum4); in convdw3x3s1_pack8_avx()
239 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_pack8_avx() local
246 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_pack8_avx()
261 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx()
436 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s2_pack8_avx() local
443 _sum1 = _mm256_fmadd_ps(_k02, _r04, _sum1); in convdw3x3s2_pack8_avx()
460 _sum2 = _mm256_fmadd_ps(_k00, _r04, _sum2); in convdw3x3s2_pack8_avx()
[all …]
H A Dconvolutiondepthwise_3x3_pack8_fp16.h97 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_fp16_pack8_avx() local
104 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_fp16_pack8_avx()
119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx()
134 _sum4 = _mm256_fmadd_ps(_k00, _r04, _sum4); in convdw3x3s1_fp16_pack8_avx()
239 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_fp16_pack8_avx() local
246 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_fp16_pack8_avx()
261 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx()
436 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s2_fp16_pack8_avx() local
443 _sum1 = _mm256_fmadd_ps(_k02, _r04, _sum1); in convdw3x3s2_fp16_pack8_avx()
460 _sum2 = _mm256_fmadd_ps(_k00, _r04, _sum2); in convdw3x3s2_fp16_pack8_avx()
[all …]
H A Dconvolution_3x3_pack8.h1401 _r04 = _mm256_broadcast_ss(r0 + 33); in conv3x3s1_winograd64_pack8_avx()
1429 _r04 = _mm256_broadcast_ss(r0 + 34); in conv3x3s1_winograd64_pack8_avx()
1457 _r04 = _mm256_broadcast_ss(r0 + 35); in conv3x3s1_winograd64_pack8_avx()
1486 _r04 = _mm256_broadcast_ss(r0 + 36); in conv3x3s1_winograd64_pack8_avx()
1515 _r04 = _mm256_broadcast_ss(r0 + 37); in conv3x3s1_winograd64_pack8_avx()
1544 _r04 = _mm256_broadcast_ss(r0 + 38); in conv3x3s1_winograd64_pack8_avx()
1572 _r04 = _mm256_broadcast_ss(r0 + 39); in conv3x3s1_winograd64_pack8_avx()
1652 _r04 = _mm256_broadcast_ss(r0 + 33); in conv3x3s1_winograd64_pack8_avx()
1671 _r04 = _mm256_broadcast_ss(r0 + 34); in conv3x3s1_winograd64_pack8_avx()
1690 _r04 = _mm256_broadcast_ss(r0 + 35); in conv3x3s1_winograd64_pack8_avx()
[all …]
/dports/graphics/waifu2x-ncnn-vulkan/waifu2x-ncnn-vulkan-20210521/src/ncnn/src/layer/x86/
H A Dconvolution_2x2_pack8.h57 __m256 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() local
80 _sum0 = _mm256_fmadd_ps(_k04, _r04, _sum0); in conv2x2s1_pack8_avx()
91 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx()
101 _sum1 = _mm256_fmadd_ps(_k04, _r04, _sum1); in conv2x2s1_pack8_avx()
123 _sum0 = _mm256_fmadd_ps(_k04, _r04, _sum0); in conv2x2s1_pack8_avx()
132 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx()
141 _sum1 = _mm256_fmadd_ps(_k04, _r04, _sum1); in conv2x2s1_pack8_avx()
251 __m256 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() local
273 _sum = _mm256_fmadd_ps(_k04, _r04, _sum); in conv2x2s1_pack8_avx()
284 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx()
[all …]
H A Dpooling_3x3_pack8.h59 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in pooling3x3s2_max_pack8_avx() local
67 __m256 _max10 = _mm256_max_ps(_r03, _r04); in pooling3x3s2_max_pack8_avx()
85 _max20 = _mm256_max_ps(_max20, _r04); in pooling3x3s2_max_pack8_avx()
137 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in pooling3x3s2_max_pack8_avx() local
145 __m256 _max10 = _mm256_max_ps(_r03, _r04); in pooling3x3s2_max_pack8_avx()
H A Dconvolutiondepthwise_3x3_pack8_fp16.h97 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_fp16_pack8_avx() local
104 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_fp16_pack8_avx()
119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx()
134 _sum4 = _mm256_fmadd_ps(_k00, _r04, _sum4); in convdw3x3s1_fp16_pack8_avx()
239 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_fp16_pack8_avx() local
246 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_fp16_pack8_avx()
261 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx()
436 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s2_fp16_pack8_avx() local
443 _sum1 = _mm256_fmadd_ps(_k02, _r04, _sum1); in convdw3x3s2_fp16_pack8_avx()
460 _sum2 = _mm256_fmadd_ps(_k00, _r04, _sum2); in convdw3x3s2_fp16_pack8_avx()
[all …]
H A Dconvolutiondepthwise_3x3_pack8.h97 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_pack8_avx() local
104 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_pack8_avx()
119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx()
134 _sum4 = _mm256_fmadd_ps(_k00, _r04, _sum4); in convdw3x3s1_pack8_avx()
239 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_pack8_avx() local
246 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_pack8_avx()
261 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx()
436 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s2_pack8_avx() local
443 _sum1 = _mm256_fmadd_ps(_k02, _r04, _sum1); in convdw3x3s2_pack8_avx()
460 _sum2 = _mm256_fmadd_ps(_k00, _r04, _sum2); in convdw3x3s2_pack8_avx()
[all …]
H A Dconvolution_3x3_pack8.h1401 _r04 = _mm256_broadcast_ss(r0 + 33); in conv3x3s1_winograd64_pack8_avx()
1429 _r04 = _mm256_broadcast_ss(r0 + 34); in conv3x3s1_winograd64_pack8_avx()
1457 _r04 = _mm256_broadcast_ss(r0 + 35); in conv3x3s1_winograd64_pack8_avx()
1486 _r04 = _mm256_broadcast_ss(r0 + 36); in conv3x3s1_winograd64_pack8_avx()
1515 _r04 = _mm256_broadcast_ss(r0 + 37); in conv3x3s1_winograd64_pack8_avx()
1544 _r04 = _mm256_broadcast_ss(r0 + 38); in conv3x3s1_winograd64_pack8_avx()
1572 _r04 = _mm256_broadcast_ss(r0 + 39); in conv3x3s1_winograd64_pack8_avx()
1652 _r04 = _mm256_broadcast_ss(r0 + 33); in conv3x3s1_winograd64_pack8_avx()
1671 _r04 = _mm256_broadcast_ss(r0 + 34); in conv3x3s1_winograd64_pack8_avx()
1690 _r04 = _mm256_broadcast_ss(r0 + 35); in conv3x3s1_winograd64_pack8_avx()
[all …]
/dports/misc/ncnn/ncnn-20211208/src/layer/x86/
H A Dconvolution_2x2_pack8.h57 __m256 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() local
80 _sum0 = _mm256_comp_fmadd_ps(_k04, _r04, _sum0); in conv2x2s1_pack8_avx()
91 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx()
101 _sum1 = _mm256_comp_fmadd_ps(_k04, _r04, _sum1); in conv2x2s1_pack8_avx()
123 _sum0 = _mm256_comp_fmadd_ps(_k04, _r04, _sum0); in conv2x2s1_pack8_avx()
132 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx()
141 _sum1 = _mm256_comp_fmadd_ps(_k04, _r04, _sum1); in conv2x2s1_pack8_avx()
251 __m256 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() local
273 _sum = _mm256_comp_fmadd_ps(_k04, _r04, _sum); in conv2x2s1_pack8_avx()
284 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx()
[all …]
H A Dpooling_3x3_pack8.h59 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in pooling3x3s2_max_pack8_avx() local
67 __m256 _max10 = _mm256_max_ps(_r03, _r04); in pooling3x3s2_max_pack8_avx()
85 _max20 = _mm256_max_ps(_max20, _r04); in pooling3x3s2_max_pack8_avx()
137 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in pooling3x3s2_max_pack8_avx() local
145 __m256 _max10 = _mm256_max_ps(_r03, _r04); in pooling3x3s2_max_pack8_avx()
H A Dconvolution_3x3_pack1to4.h119 __m128 _r04 = _mm_set1_ps(*(r0 + 3)); in conv3x3s1_pack1to4_sse() local
254 __m128 _r04 = _mm_set1_ps(*(r0 + 3)); in conv3x3s1_pack1to4_sse() local
403 __m128 _r04 = _mm_set1_ps(*(r0 + 3)); in conv3x3s1_pack1to4_sse() local
410 _sum1 = _mm_comp_fmadd_ps(_r04, _k02, _sum1); in conv3x3s1_pack1to4_sse()
425 _sum2 = _mm_comp_fmadd_ps(_r04, _k01, _sum2); in conv3x3s1_pack1to4_sse()
482 __m128 _r04 = _mm_set1_ps(*(r0 + 3)); in conv3x3s1_pack1to4_sse() local
653 __m128 _r04 = _mm_set1_ps(*(r0 + 3)); in conv3x3s2_pack1to4_sse() local
796 __m128 _r04 = _mm_set1_ps(*(r0 + 3)); in conv3x3s2_pack1to4_sse() local
949 __m128 _r04 = _mm_set1_ps(*(r0 + 3)); in conv3x3s2_pack1to4_sse() local
1128 __m128 _r04 = _mm_set1_ps(*(r0 + 3)); in conv3x3s2_pack1to4_sse() local
[all …]
H A Dconvolutiondepthwise_3x3_pack4.h97 __m128 _r04 = _mm_loadu_ps(r0 + 16); in convdw3x3s1_pack4_sse() local
104 _sum2 = _mm_comp_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_pack4_sse()
119 _sum3 = _mm_comp_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack4_sse()
134 _sum4 = _mm_comp_fmadd_ps(_k00, _r04, _sum4); in convdw3x3s1_pack4_sse()
239 __m128 _r04 = _mm_loadu_ps(r0 + 16); in convdw3x3s1_pack4_sse() local
246 _sum2 = _mm_comp_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_pack4_sse()
261 _sum3 = _mm_comp_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack4_sse()
436 __m128 _r04 = _mm_loadu_ps(r0 + 16); in convdw3x3s2_pack4_sse() local
443 _sum1 = _mm_comp_fmadd_ps(_k02, _r04, _sum1); in convdw3x3s2_pack4_sse()
460 _sum2 = _mm_comp_fmadd_ps(_k00, _r04, _sum2); in convdw3x3s2_pack4_sse()
[all …]
H A Dconvolutiondepthwise_3x3_pack8_fp16.h97 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_fp16_pack8_avx() local
104 _sum2 = _mm256_comp_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_fp16_pack8_avx()
119 _sum3 = _mm256_comp_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx()
134 _sum4 = _mm256_comp_fmadd_ps(_k00, _r04, _sum4); in convdw3x3s1_fp16_pack8_avx()
239 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_fp16_pack8_avx() local
246 _sum2 = _mm256_comp_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_fp16_pack8_avx()
261 _sum3 = _mm256_comp_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx()
436 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s2_fp16_pack8_avx() local
443 _sum1 = _mm256_comp_fmadd_ps(_k02, _r04, _sum1); in convdw3x3s2_fp16_pack8_avx()
460 _sum2 = _mm256_comp_fmadd_ps(_k00, _r04, _sum2); in convdw3x3s2_fp16_pack8_avx()
[all …]
H A Dconvolutiondepthwise_3x3_pack8.h97 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_pack8_avx() local
104 _sum2 = _mm256_comp_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_pack8_avx()
119 _sum3 = _mm256_comp_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx()
134 _sum4 = _mm256_comp_fmadd_ps(_k00, _r04, _sum4); in convdw3x3s1_pack8_avx()
239 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_pack8_avx() local
246 _sum2 = _mm256_comp_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_pack8_avx()
261 _sum3 = _mm256_comp_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx()
436 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s2_pack8_avx() local
443 _sum1 = _mm256_comp_fmadd_ps(_k02, _r04, _sum1); in convdw3x3s2_pack8_avx()
460 _sum2 = _mm256_comp_fmadd_ps(_k00, _r04, _sum2); in convdw3x3s2_pack8_avx()
[all …]
/dports/benchmarks/vkpeak/vkpeak-20210430/ncnn/src/layer/x86/
H A Dconvolution_2x2_pack8.h57 __m256 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() local
80 _sum0 = _mm256_fmadd_ps(_k04, _r04, _sum0); in conv2x2s1_pack8_avx()
91 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx()
101 _sum1 = _mm256_fmadd_ps(_k04, _r04, _sum1); in conv2x2s1_pack8_avx()
123 _sum0 = _mm256_fmadd_ps(_k04, _r04, _sum0); in conv2x2s1_pack8_avx()
132 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx()
141 _sum1 = _mm256_fmadd_ps(_k04, _r04, _sum1); in conv2x2s1_pack8_avx()
251 __m256 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() local
273 _sum = _mm256_fmadd_ps(_k04, _r04, _sum); in conv2x2s1_pack8_avx()
284 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx()
[all …]
H A Dpooling_3x3_pack8.h59 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in pooling3x3s2_max_pack8_avx() local
67 __m256 _max10 = _mm256_max_ps(_r03, _r04); in pooling3x3s2_max_pack8_avx()
85 _max20 = _mm256_max_ps(_max20, _r04); in pooling3x3s2_max_pack8_avx()
137 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in pooling3x3s2_max_pack8_avx() local
145 __m256 _max10 = _mm256_max_ps(_r03, _r04); in pooling3x3s2_max_pack8_avx()
H A Dconvolutiondepthwise_3x3_pack8.h97 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_pack8_avx() local
104 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_pack8_avx()
119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx()
134 _sum4 = _mm256_fmadd_ps(_k00, _r04, _sum4); in convdw3x3s1_pack8_avx()
239 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_pack8_avx() local
246 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_pack8_avx()
261 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx()
436 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s2_pack8_avx() local
443 _sum1 = _mm256_fmadd_ps(_k02, _r04, _sum1); in convdw3x3s2_pack8_avx()
460 _sum2 = _mm256_fmadd_ps(_k00, _r04, _sum2); in convdw3x3s2_pack8_avx()
[all …]
H A Dconvolutiondepthwise_3x3_pack8_fp16.h97 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_fp16_pack8_avx() local
104 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_fp16_pack8_avx()
119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx()
134 _sum4 = _mm256_fmadd_ps(_k00, _r04, _sum4); in convdw3x3s1_fp16_pack8_avx()
239 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_fp16_pack8_avx() local
246 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_fp16_pack8_avx()
261 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx()
436 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s2_fp16_pack8_avx() local
443 _sum1 = _mm256_fmadd_ps(_k02, _r04, _sum1); in convdw3x3s2_fp16_pack8_avx()
460 _sum2 = _mm256_fmadd_ps(_k00, _r04, _sum2); in convdw3x3s2_fp16_pack8_avx()
[all …]

12345