/dports/graphics/realsr-ncnn-vulkan/realsr-ncnn-vulkan-20210210/src/ncnn/src/layer/x86/ |
H A D | convolution_3x3_pack8.h | 696 _r04 = _mm256_broadcast_ss(r0 + 33); in conv3x3s1_winograd64_pack8_avx() 724 _r04 = _mm256_broadcast_ss(r0 + 34); in conv3x3s1_winograd64_pack8_avx() 752 _r04 = _mm256_broadcast_ss(r0 + 35); in conv3x3s1_winograd64_pack8_avx() 781 _r04 = _mm256_broadcast_ss(r0 + 36); in conv3x3s1_winograd64_pack8_avx() 810 _r04 = _mm256_broadcast_ss(r0 + 37); in conv3x3s1_winograd64_pack8_avx() 839 _r04 = _mm256_broadcast_ss(r0 + 38); in conv3x3s1_winograd64_pack8_avx() 867 _r04 = _mm256_broadcast_ss(r0 + 39); in conv3x3s1_winograd64_pack8_avx() 947 _r04 = _mm256_broadcast_ss(r0 + 33); in conv3x3s1_winograd64_pack8_avx() 966 _r04 = _mm256_broadcast_ss(r0 + 34); in conv3x3s1_winograd64_pack8_avx() 985 _r04 = _mm256_broadcast_ss(r0 + 35); in conv3x3s1_winograd64_pack8_avx() [all …]
|
H A D | convolution_2x2_pack8.h | 57 __m256 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() local 80 _sum0 = _mm256_fmadd_ps(_k04, _r04, _sum0); in conv2x2s1_pack8_avx() 91 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() 101 _sum1 = _mm256_fmadd_ps(_k04, _r04, _sum1); in conv2x2s1_pack8_avx() 123 _sum0 = _mm256_fmadd_ps(_k04, _r04, _sum0); in conv2x2s1_pack8_avx() 132 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() 141 _sum1 = _mm256_fmadd_ps(_k04, _r04, _sum1); in conv2x2s1_pack8_avx() 251 __m256 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() local 273 _sum = _mm256_fmadd_ps(_k04, _r04, _sum); in conv2x2s1_pack8_avx() 284 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() [all …]
|
H A D | pooling_3x3_pack8.h | 59 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in pooling3x3s2_max_pack8_avx() local 67 __m256 _max10 = _mm256_max_ps(_r03, _r04); in pooling3x3s2_max_pack8_avx() 85 _max20 = _mm256_max_ps(_max20, _r04); in pooling3x3s2_max_pack8_avx() 137 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in pooling3x3s2_max_pack8_avx() local 145 __m256 _max10 = _mm256_max_ps(_r03, _r04); in pooling3x3s2_max_pack8_avx()
|
H A D | convolutiondepthwise_3x3_pack8_fp16.h | 97 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_fp16_pack8_avx() local 104 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_fp16_pack8_avx() 119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx() 134 _sum4 = _mm256_fmadd_ps(_k00, _r04, _sum4); in convdw3x3s1_fp16_pack8_avx() 239 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_fp16_pack8_avx() local 246 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_fp16_pack8_avx() 261 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx() 436 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s2_fp16_pack8_avx() local 443 _sum1 = _mm256_fmadd_ps(_k02, _r04, _sum1); in convdw3x3s2_fp16_pack8_avx() 460 _sum2 = _mm256_fmadd_ps(_k00, _r04, _sum2); in convdw3x3s2_fp16_pack8_avx() [all …]
|
H A D | convolutiondepthwise_3x3_pack8.h | 97 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_pack8_avx() local 104 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_pack8_avx() 119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx() 134 _sum4 = _mm256_fmadd_ps(_k00, _r04, _sum4); in convdw3x3s1_pack8_avx() 239 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_pack8_avx() local 246 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_pack8_avx() 261 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx() 436 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s2_pack8_avx() local 443 _sum1 = _mm256_fmadd_ps(_k02, _r04, _sum1); in convdw3x3s2_pack8_avx() 460 _sum2 = _mm256_fmadd_ps(_k00, _r04, _sum2); in convdw3x3s2_pack8_avx() [all …]
|
/dports/graphics/vapoursynth-waifu2x-ncnn-vulkan/vapoursynth-waifu2x-ncnn-vulkan-r4/deps/ncnn/src/layer/x86/ |
H A D | convolution_2x2_pack8.h | 57 __m256 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() local 80 _sum0 = _mm256_fmadd_ps(_k04, _r04, _sum0); in conv2x2s1_pack8_avx() 91 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() 101 _sum1 = _mm256_fmadd_ps(_k04, _r04, _sum1); in conv2x2s1_pack8_avx() 123 _sum0 = _mm256_fmadd_ps(_k04, _r04, _sum0); in conv2x2s1_pack8_avx() 132 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() 141 _sum1 = _mm256_fmadd_ps(_k04, _r04, _sum1); in conv2x2s1_pack8_avx() 251 __m256 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() local 273 _sum = _mm256_fmadd_ps(_k04, _r04, _sum); in conv2x2s1_pack8_avx() 284 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() [all …]
|
H A D | pooling_3x3_pack8.h | 59 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in pooling3x3s2_max_pack8_avx() local 67 __m256 _max10 = _mm256_max_ps(_r03, _r04); in pooling3x3s2_max_pack8_avx() 85 _max20 = _mm256_max_ps(_max20, _r04); in pooling3x3s2_max_pack8_avx() 137 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in pooling3x3s2_max_pack8_avx() local 145 __m256 _max10 = _mm256_max_ps(_r03, _r04); in pooling3x3s2_max_pack8_avx()
|
H A D | convolutiondepthwise_3x3_pack8.h | 97 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_pack8_avx() local 104 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_pack8_avx() 119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx() 134 _sum4 = _mm256_fmadd_ps(_k00, _r04, _sum4); in convdw3x3s1_pack8_avx() 239 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_pack8_avx() local 246 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_pack8_avx() 261 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx() 436 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s2_pack8_avx() local 443 _sum1 = _mm256_fmadd_ps(_k02, _r04, _sum1); in convdw3x3s2_pack8_avx() 460 _sum2 = _mm256_fmadd_ps(_k00, _r04, _sum2); in convdw3x3s2_pack8_avx() [all …]
|
H A D | convolutiondepthwise_3x3_pack8_fp16.h | 97 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_fp16_pack8_avx() local 104 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_fp16_pack8_avx() 119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx() 134 _sum4 = _mm256_fmadd_ps(_k00, _r04, _sum4); in convdw3x3s1_fp16_pack8_avx() 239 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_fp16_pack8_avx() local 246 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_fp16_pack8_avx() 261 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx() 436 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s2_fp16_pack8_avx() local 443 _sum1 = _mm256_fmadd_ps(_k02, _r04, _sum1); in convdw3x3s2_fp16_pack8_avx() 460 _sum2 = _mm256_fmadd_ps(_k00, _r04, _sum2); in convdw3x3s2_fp16_pack8_avx() [all …]
|
H A D | convolution_3x3_pack8.h | 1401 _r04 = _mm256_broadcast_ss(r0 + 33); in conv3x3s1_winograd64_pack8_avx() 1429 _r04 = _mm256_broadcast_ss(r0 + 34); in conv3x3s1_winograd64_pack8_avx() 1457 _r04 = _mm256_broadcast_ss(r0 + 35); in conv3x3s1_winograd64_pack8_avx() 1486 _r04 = _mm256_broadcast_ss(r0 + 36); in conv3x3s1_winograd64_pack8_avx() 1515 _r04 = _mm256_broadcast_ss(r0 + 37); in conv3x3s1_winograd64_pack8_avx() 1544 _r04 = _mm256_broadcast_ss(r0 + 38); in conv3x3s1_winograd64_pack8_avx() 1572 _r04 = _mm256_broadcast_ss(r0 + 39); in conv3x3s1_winograd64_pack8_avx() 1652 _r04 = _mm256_broadcast_ss(r0 + 33); in conv3x3s1_winograd64_pack8_avx() 1671 _r04 = _mm256_broadcast_ss(r0 + 34); in conv3x3s1_winograd64_pack8_avx() 1690 _r04 = _mm256_broadcast_ss(r0 + 35); in conv3x3s1_winograd64_pack8_avx() [all …]
|
/dports/graphics/waifu2x-ncnn-vulkan/waifu2x-ncnn-vulkan-20210521/src/ncnn/src/layer/x86/ |
H A D | convolution_2x2_pack8.h | 57 __m256 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() local 80 _sum0 = _mm256_fmadd_ps(_k04, _r04, _sum0); in conv2x2s1_pack8_avx() 91 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() 101 _sum1 = _mm256_fmadd_ps(_k04, _r04, _sum1); in conv2x2s1_pack8_avx() 123 _sum0 = _mm256_fmadd_ps(_k04, _r04, _sum0); in conv2x2s1_pack8_avx() 132 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() 141 _sum1 = _mm256_fmadd_ps(_k04, _r04, _sum1); in conv2x2s1_pack8_avx() 251 __m256 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() local 273 _sum = _mm256_fmadd_ps(_k04, _r04, _sum); in conv2x2s1_pack8_avx() 284 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() [all …]
|
H A D | pooling_3x3_pack8.h | 59 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in pooling3x3s2_max_pack8_avx() local 67 __m256 _max10 = _mm256_max_ps(_r03, _r04); in pooling3x3s2_max_pack8_avx() 85 _max20 = _mm256_max_ps(_max20, _r04); in pooling3x3s2_max_pack8_avx() 137 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in pooling3x3s2_max_pack8_avx() local 145 __m256 _max10 = _mm256_max_ps(_r03, _r04); in pooling3x3s2_max_pack8_avx()
|
H A D | convolutiondepthwise_3x3_pack8_fp16.h | 97 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_fp16_pack8_avx() local 104 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_fp16_pack8_avx() 119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx() 134 _sum4 = _mm256_fmadd_ps(_k00, _r04, _sum4); in convdw3x3s1_fp16_pack8_avx() 239 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_fp16_pack8_avx() local 246 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_fp16_pack8_avx() 261 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx() 436 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s2_fp16_pack8_avx() local 443 _sum1 = _mm256_fmadd_ps(_k02, _r04, _sum1); in convdw3x3s2_fp16_pack8_avx() 460 _sum2 = _mm256_fmadd_ps(_k00, _r04, _sum2); in convdw3x3s2_fp16_pack8_avx() [all …]
|
H A D | convolutiondepthwise_3x3_pack8.h | 97 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_pack8_avx() local 104 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_pack8_avx() 119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx() 134 _sum4 = _mm256_fmadd_ps(_k00, _r04, _sum4); in convdw3x3s1_pack8_avx() 239 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_pack8_avx() local 246 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_pack8_avx() 261 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx() 436 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s2_pack8_avx() local 443 _sum1 = _mm256_fmadd_ps(_k02, _r04, _sum1); in convdw3x3s2_pack8_avx() 460 _sum2 = _mm256_fmadd_ps(_k00, _r04, _sum2); in convdw3x3s2_pack8_avx() [all …]
|
H A D | convolution_3x3_pack8.h | 1401 _r04 = _mm256_broadcast_ss(r0 + 33); in conv3x3s1_winograd64_pack8_avx() 1429 _r04 = _mm256_broadcast_ss(r0 + 34); in conv3x3s1_winograd64_pack8_avx() 1457 _r04 = _mm256_broadcast_ss(r0 + 35); in conv3x3s1_winograd64_pack8_avx() 1486 _r04 = _mm256_broadcast_ss(r0 + 36); in conv3x3s1_winograd64_pack8_avx() 1515 _r04 = _mm256_broadcast_ss(r0 + 37); in conv3x3s1_winograd64_pack8_avx() 1544 _r04 = _mm256_broadcast_ss(r0 + 38); in conv3x3s1_winograd64_pack8_avx() 1572 _r04 = _mm256_broadcast_ss(r0 + 39); in conv3x3s1_winograd64_pack8_avx() 1652 _r04 = _mm256_broadcast_ss(r0 + 33); in conv3x3s1_winograd64_pack8_avx() 1671 _r04 = _mm256_broadcast_ss(r0 + 34); in conv3x3s1_winograd64_pack8_avx() 1690 _r04 = _mm256_broadcast_ss(r0 + 35); in conv3x3s1_winograd64_pack8_avx() [all …]
|
/dports/misc/ncnn/ncnn-20211208/src/layer/x86/ |
H A D | convolution_2x2_pack8.h | 57 __m256 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() local 80 _sum0 = _mm256_comp_fmadd_ps(_k04, _r04, _sum0); in conv2x2s1_pack8_avx() 91 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() 101 _sum1 = _mm256_comp_fmadd_ps(_k04, _r04, _sum1); in conv2x2s1_pack8_avx() 123 _sum0 = _mm256_comp_fmadd_ps(_k04, _r04, _sum0); in conv2x2s1_pack8_avx() 132 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() 141 _sum1 = _mm256_comp_fmadd_ps(_k04, _r04, _sum1); in conv2x2s1_pack8_avx() 251 __m256 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() local 273 _sum = _mm256_comp_fmadd_ps(_k04, _r04, _sum); in conv2x2s1_pack8_avx() 284 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() [all …]
|
H A D | pooling_3x3_pack8.h | 59 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in pooling3x3s2_max_pack8_avx() local 67 __m256 _max10 = _mm256_max_ps(_r03, _r04); in pooling3x3s2_max_pack8_avx() 85 _max20 = _mm256_max_ps(_max20, _r04); in pooling3x3s2_max_pack8_avx() 137 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in pooling3x3s2_max_pack8_avx() local 145 __m256 _max10 = _mm256_max_ps(_r03, _r04); in pooling3x3s2_max_pack8_avx()
|
H A D | convolution_3x3_pack1to4.h | 119 __m128 _r04 = _mm_set1_ps(*(r0 + 3)); in conv3x3s1_pack1to4_sse() local 254 __m128 _r04 = _mm_set1_ps(*(r0 + 3)); in conv3x3s1_pack1to4_sse() local 403 __m128 _r04 = _mm_set1_ps(*(r0 + 3)); in conv3x3s1_pack1to4_sse() local 410 _sum1 = _mm_comp_fmadd_ps(_r04, _k02, _sum1); in conv3x3s1_pack1to4_sse() 425 _sum2 = _mm_comp_fmadd_ps(_r04, _k01, _sum2); in conv3x3s1_pack1to4_sse() 482 __m128 _r04 = _mm_set1_ps(*(r0 + 3)); in conv3x3s1_pack1to4_sse() local 653 __m128 _r04 = _mm_set1_ps(*(r0 + 3)); in conv3x3s2_pack1to4_sse() local 796 __m128 _r04 = _mm_set1_ps(*(r0 + 3)); in conv3x3s2_pack1to4_sse() local 949 __m128 _r04 = _mm_set1_ps(*(r0 + 3)); in conv3x3s2_pack1to4_sse() local 1128 __m128 _r04 = _mm_set1_ps(*(r0 + 3)); in conv3x3s2_pack1to4_sse() local [all …]
|
H A D | convolutiondepthwise_3x3_pack4.h | 97 __m128 _r04 = _mm_loadu_ps(r0 + 16); in convdw3x3s1_pack4_sse() local 104 _sum2 = _mm_comp_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_pack4_sse() 119 _sum3 = _mm_comp_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack4_sse() 134 _sum4 = _mm_comp_fmadd_ps(_k00, _r04, _sum4); in convdw3x3s1_pack4_sse() 239 __m128 _r04 = _mm_loadu_ps(r0 + 16); in convdw3x3s1_pack4_sse() local 246 _sum2 = _mm_comp_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_pack4_sse() 261 _sum3 = _mm_comp_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack4_sse() 436 __m128 _r04 = _mm_loadu_ps(r0 + 16); in convdw3x3s2_pack4_sse() local 443 _sum1 = _mm_comp_fmadd_ps(_k02, _r04, _sum1); in convdw3x3s2_pack4_sse() 460 _sum2 = _mm_comp_fmadd_ps(_k00, _r04, _sum2); in convdw3x3s2_pack4_sse() [all …]
|
H A D | convolutiondepthwise_3x3_pack8_fp16.h | 97 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_fp16_pack8_avx() local 104 _sum2 = _mm256_comp_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_fp16_pack8_avx() 119 _sum3 = _mm256_comp_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx() 134 _sum4 = _mm256_comp_fmadd_ps(_k00, _r04, _sum4); in convdw3x3s1_fp16_pack8_avx() 239 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_fp16_pack8_avx() local 246 _sum2 = _mm256_comp_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_fp16_pack8_avx() 261 _sum3 = _mm256_comp_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx() 436 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s2_fp16_pack8_avx() local 443 _sum1 = _mm256_comp_fmadd_ps(_k02, _r04, _sum1); in convdw3x3s2_fp16_pack8_avx() 460 _sum2 = _mm256_comp_fmadd_ps(_k00, _r04, _sum2); in convdw3x3s2_fp16_pack8_avx() [all …]
|
H A D | convolutiondepthwise_3x3_pack8.h | 97 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_pack8_avx() local 104 _sum2 = _mm256_comp_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_pack8_avx() 119 _sum3 = _mm256_comp_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx() 134 _sum4 = _mm256_comp_fmadd_ps(_k00, _r04, _sum4); in convdw3x3s1_pack8_avx() 239 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_pack8_avx() local 246 _sum2 = _mm256_comp_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_pack8_avx() 261 _sum3 = _mm256_comp_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx() 436 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s2_pack8_avx() local 443 _sum1 = _mm256_comp_fmadd_ps(_k02, _r04, _sum1); in convdw3x3s2_pack8_avx() 460 _sum2 = _mm256_comp_fmadd_ps(_k00, _r04, _sum2); in convdw3x3s2_pack8_avx() [all …]
|
/dports/benchmarks/vkpeak/vkpeak-20210430/ncnn/src/layer/x86/ |
H A D | convolution_2x2_pack8.h | 57 __m256 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() local 80 _sum0 = _mm256_fmadd_ps(_k04, _r04, _sum0); in conv2x2s1_pack8_avx() 91 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() 101 _sum1 = _mm256_fmadd_ps(_k04, _r04, _sum1); in conv2x2s1_pack8_avx() 123 _sum0 = _mm256_fmadd_ps(_k04, _r04, _sum0); in conv2x2s1_pack8_avx() 132 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() 141 _sum1 = _mm256_fmadd_ps(_k04, _r04, _sum1); in conv2x2s1_pack8_avx() 251 __m256 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() local 273 _sum = _mm256_fmadd_ps(_k04, _r04, _sum); in conv2x2s1_pack8_avx() 284 _r04 = _mm256_broadcast_ss(r0 + 4); in conv2x2s1_pack8_avx() [all …]
|
H A D | pooling_3x3_pack8.h | 59 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in pooling3x3s2_max_pack8_avx() local 67 __m256 _max10 = _mm256_max_ps(_r03, _r04); in pooling3x3s2_max_pack8_avx() 85 _max20 = _mm256_max_ps(_max20, _r04); in pooling3x3s2_max_pack8_avx() 137 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in pooling3x3s2_max_pack8_avx() local 145 __m256 _max10 = _mm256_max_ps(_r03, _r04); in pooling3x3s2_max_pack8_avx()
|
H A D | convolutiondepthwise_3x3_pack8.h | 97 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_pack8_avx() local 104 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_pack8_avx() 119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx() 134 _sum4 = _mm256_fmadd_ps(_k00, _r04, _sum4); in convdw3x3s1_pack8_avx() 239 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_pack8_avx() local 246 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_pack8_avx() 261 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_pack8_avx() 436 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s2_pack8_avx() local 443 _sum1 = _mm256_fmadd_ps(_k02, _r04, _sum1); in convdw3x3s2_pack8_avx() 460 _sum2 = _mm256_fmadd_ps(_k00, _r04, _sum2); in convdw3x3s2_pack8_avx() [all …]
|
H A D | convolutiondepthwise_3x3_pack8_fp16.h | 97 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_fp16_pack8_avx() local 104 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_fp16_pack8_avx() 119 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx() 134 _sum4 = _mm256_fmadd_ps(_k00, _r04, _sum4); in convdw3x3s1_fp16_pack8_avx() 239 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s1_fp16_pack8_avx() local 246 _sum2 = _mm256_fmadd_ps(_k02, _r04, _sum2); in convdw3x3s1_fp16_pack8_avx() 261 _sum3 = _mm256_fmadd_ps(_k01, _r04, _sum3); in convdw3x3s1_fp16_pack8_avx() 436 __m256 _r04 = _mm256_loadu_ps(r0 + 32); in convdw3x3s2_fp16_pack8_avx() local 443 _sum1 = _mm256_fmadd_ps(_k02, _r04, _sum1); in convdw3x3s2_fp16_pack8_avx() 460 _sum2 = _mm256_fmadd_ps(_k00, _r04, _sum2); in convdw3x3s2_fp16_pack8_avx() [all …]
|