/dports/graphics/vapoursynth-waifu2x-ncnn-vulkan/vapoursynth-waifu2x-ncnn-vulkan-r4/deps/ncnn/src/layer/x86/ |
H A D | convolution_3x3.h | 389 __m256 _sum1n = _mm256_broadcast_ss(&zero_val); in conv3x3s1_winograd23_sse() local 423 _sum1n = _mm256_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 443 _sum1n = _mm256_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 462 _sum1n = _mm256_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 481 _sum1n = _mm256_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 511 _sum1n = _mm256_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 521 _mm256_storeu_ps(output1_tm + 8, _sum1n); in conv3x3s1_winograd23_sse()
|
/dports/graphics/waifu2x-ncnn-vulkan/waifu2x-ncnn-vulkan-20210521/src/ncnn/src/layer/x86/ |
H A D | convolution_3x3.h | 389 __m256 _sum1n = _mm256_broadcast_ss(&zero_val); in conv3x3s1_winograd23_sse() local 423 _sum1n = _mm256_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 443 _sum1n = _mm256_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 462 _sum1n = _mm256_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 481 _sum1n = _mm256_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 511 _sum1n = _mm256_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 521 _mm256_storeu_ps(output1_tm + 8, _sum1n); in conv3x3s1_winograd23_sse()
|
/dports/misc/ncnn/ncnn-20211208/src/layer/x86/ |
H A D | convolution_3x3.h | 389 __m256 _sum1n = _mm256_broadcast_ss(&zero_val); in conv3x3s1_winograd23_sse() local 423 _sum1n = _mm256_comp_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 443 _sum1n = _mm256_comp_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 462 _sum1n = _mm256_comp_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 481 _sum1n = _mm256_comp_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 511 _sum1n = _mm256_comp_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 521 _mm256_storeu_ps(output1_tm + 8, _sum1n); in conv3x3s1_winograd23_sse()
|
/dports/benchmarks/vkpeak/vkpeak-20210430/ncnn/src/layer/x86/ |
H A D | convolution_3x3.h | 389 __m256 _sum1n = _mm256_broadcast_ss(&zero_val); in conv3x3s1_winograd23_sse() local 423 _sum1n = _mm256_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 443 _sum1n = _mm256_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 462 _sum1n = _mm256_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 481 _sum1n = _mm256_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 511 _sum1n = _mm256_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 521 _mm256_storeu_ps(output1_tm + 8, _sum1n); in conv3x3s1_winograd23_sse()
|
/dports/graphics/realsr-ncnn-vulkan/realsr-ncnn-vulkan-20210210/src/ncnn/src/layer/x86/ |
H A D | convolution_3x3.h | 389 __m256 _sum1n = _mm256_broadcast_ss(&zero_val); in conv3x3s1_winograd23_sse() local 423 _sum1n = _mm256_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 443 _sum1n = _mm256_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 462 _sum1n = _mm256_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 481 _sum1n = _mm256_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 511 _sum1n = _mm256_fmadd_ps(_r0n, _k1n, _sum1n); in conv3x3s1_winograd23_sse() 521 _mm256_storeu_ps(output1_tm + 8, _sum1n); in conv3x3s1_winograd23_sse()
|
/dports/graphics/waifu2x-ncnn-vulkan/waifu2x-ncnn-vulkan-20210521/src/ncnn/src/layer/arm/ |
H A D | convolution_3x3.h | 388 float32x4_t _sum1n = vmulq_f32(_r10, _k10); in conv3x3s1_neon() local 390 _sum1n = vmlaq_f32(_sum1n, _r20, _k13); in conv3x3s1_neon() 392 _sum1n = vmlaq_f32(_sum1n, _r30, _k16); in conv3x3s1_neon() 397 _sum1n = vsetq_lane_f32(*outptr1n, _sum1n, 3); in conv3x3s1_neon() 402 *outptr1n = vaddvq_f32(_sum1n); in conv3x3s1_neon() 407 float32x2_t _ss1n = vadd_f32(vget_low_f32(_sum1n), vget_high_f32(_sum1n)); in conv3x3s1_neon()
|
H A D | convolution_3x3_int8.h | 4162 int32x4_t _sum1n = vmull_laneq_s16(vget_high_s16(_k1), _r0, 1); in conv3x3s2_packed_int8_neon() local 4178 _sum1n = vmlal_laneq_s16(_sum1n, vget_high_s16(_k1), _r1, 1); in conv3x3s2_packed_int8_neon() 4194 _sum1n = vmlal_laneq_s16(_sum1n, vget_high_s16(_k1), _r2, 1); in conv3x3s2_packed_int8_neon() 4199 _sum0n = vaddq_s32(_sum0n, _sum1n); in conv3x3s2_packed_int8_neon()
|
/dports/benchmarks/vkpeak/vkpeak-20210430/ncnn/src/layer/arm/ |
H A D | convolution_3x3.h | 388 float32x4_t _sum1n = vmulq_f32(_r10, _k10); in conv3x3s1_neon() local 390 _sum1n = vmlaq_f32(_sum1n, _r20, _k13); in conv3x3s1_neon() 392 _sum1n = vmlaq_f32(_sum1n, _r30, _k16); in conv3x3s1_neon() 397 _sum1n = vsetq_lane_f32(*outptr1n, _sum1n, 3); in conv3x3s1_neon() 402 *outptr1n = vaddvq_f32(_sum1n); in conv3x3s1_neon() 407 float32x2_t _ss1n = vadd_f32(vget_low_f32(_sum1n), vget_high_f32(_sum1n)); in conv3x3s1_neon()
|
H A D | convolution_3x3_int8.h | 4162 int32x4_t _sum1n = vmull_laneq_s16(vget_high_s16(_k1), _r0, 1); in conv3x3s2_packed_int8_neon() local 4178 _sum1n = vmlal_laneq_s16(_sum1n, vget_high_s16(_k1), _r1, 1); in conv3x3s2_packed_int8_neon() 4194 _sum1n = vmlal_laneq_s16(_sum1n, vget_high_s16(_k1), _r2, 1); in conv3x3s2_packed_int8_neon() 4199 _sum0n = vaddq_s32(_sum0n, _sum1n); in conv3x3s2_packed_int8_neon()
|
/dports/misc/ncnn/ncnn-20211208/src/layer/arm/ |
H A D | convolution_3x3.h | 388 float32x4_t _sum1n = vmulq_f32(_r10, _k10); in conv3x3s1_neon() local 390 _sum1n = vmlaq_f32(_sum1n, _r20, _k13); in conv3x3s1_neon() 392 _sum1n = vmlaq_f32(_sum1n, _r30, _k16); in conv3x3s1_neon() 397 _sum1n = vsetq_lane_f32(*outptr1n, _sum1n, 3); in conv3x3s1_neon() 402 *outptr1n = vaddvq_f32(_sum1n); in conv3x3s1_neon() 407 float32x2_t _ss1n = vadd_f32(vget_low_f32(_sum1n), vget_high_f32(_sum1n)); in conv3x3s1_neon()
|
H A D | convolution_3x3_int8.h | 4162 int32x4_t _sum1n = vmull_laneq_s16(vget_high_s16(_k1), _r0, 1); in conv3x3s2_packed_int8_neon() local 4178 _sum1n = vmlal_laneq_s16(_sum1n, vget_high_s16(_k1), _r1, 1); in conv3x3s2_packed_int8_neon() 4194 _sum1n = vmlal_laneq_s16(_sum1n, vget_high_s16(_k1), _r2, 1); in conv3x3s2_packed_int8_neon() 4199 _sum0n = vaddq_s32(_sum0n, _sum1n); in conv3x3s2_packed_int8_neon()
|
/dports/graphics/vapoursynth-waifu2x-ncnn-vulkan/vapoursynth-waifu2x-ncnn-vulkan-r4/deps/ncnn/src/layer/arm/ |
H A D | convolution_3x3.h | 388 float32x4_t _sum1n = vmulq_f32(_r10, _k10); in conv3x3s1_neon() local 390 _sum1n = vmlaq_f32(_sum1n, _r20, _k13); in conv3x3s1_neon() 392 _sum1n = vmlaq_f32(_sum1n, _r30, _k16); in conv3x3s1_neon() 397 _sum1n = vsetq_lane_f32(*outptr1n, _sum1n, 3); in conv3x3s1_neon() 402 *outptr1n = vaddvq_f32(_sum1n); in conv3x3s1_neon() 407 float32x2_t _ss1n = vadd_f32(vget_low_f32(_sum1n), vget_high_f32(_sum1n)); in conv3x3s1_neon()
|
H A D | convolution_3x3_int8.h | 4162 int32x4_t _sum1n = vmull_laneq_s16(vget_high_s16(_k1), _r0, 1); in conv3x3s2_packed_int8_neon() local 4178 _sum1n = vmlal_laneq_s16(_sum1n, vget_high_s16(_k1), _r1, 1); in conv3x3s2_packed_int8_neon() 4194 _sum1n = vmlal_laneq_s16(_sum1n, vget_high_s16(_k1), _r2, 1); in conv3x3s2_packed_int8_neon() 4199 _sum0n = vaddq_s32(_sum0n, _sum1n); in conv3x3s2_packed_int8_neon()
|
/dports/graphics/realsr-ncnn-vulkan/realsr-ncnn-vulkan-20210210/src/ncnn/src/layer/arm/ |
H A D | convolution_3x3.h | 388 float32x4_t _sum1n = vmulq_f32(_r10, _k10); in conv3x3s1_neon() local 390 _sum1n = vmlaq_f32(_sum1n, _r20, _k13); in conv3x3s1_neon() 392 _sum1n = vmlaq_f32(_sum1n, _r30, _k16); in conv3x3s1_neon() 397 _sum1n = vsetq_lane_f32(*outptr1n, _sum1n, 3); in conv3x3s1_neon() 402 *outptr1n = vaddvq_f32(_sum1n); in conv3x3s1_neon() 407 float32x2_t _ss1n = vadd_f32(vget_low_f32(_sum1n), vget_high_f32(_sum1n)); in conv3x3s1_neon()
|
H A D | convolution_3x3_int8.h | 4162 int32x4_t _sum1n = vmull_laneq_s16(vget_high_s16(_k1), _r0, 1); in conv3x3s2_packed_int8_neon() local 4178 _sum1n = vmlal_laneq_s16(_sum1n, vget_high_s16(_k1), _r1, 1); in conv3x3s2_packed_int8_neon() 4194 _sum1n = vmlal_laneq_s16(_sum1n, vget_high_s16(_k1), _r2, 1); in conv3x3s2_packed_int8_neon() 4199 _sum0n = vaddq_s32(_sum0n, _sum1n); in conv3x3s2_packed_int8_neon()
|