Lines Matching refs:va (all references to the local pointer va in conv_im2col_sgemm_sse())

245 const float* va = kernel_tm.channel(i / 8); in conv_im2col_sgemm_sse() local
260 __m256 _va0 = _mm256_broadcast_ss(va); in conv_im2col_sgemm_sse()
261 __m256 _va1 = _mm256_broadcast_ss(va + 1); in conv_im2col_sgemm_sse()
262 __m256 _va2 = _mm256_broadcast_ss(va + 2); in conv_im2col_sgemm_sse()
263 __m256 _va3 = _mm256_broadcast_ss(va + 3); in conv_im2col_sgemm_sse()
272 _va0 = _mm256_broadcast_ss(va + 4); in conv_im2col_sgemm_sse()
273 _va1 = _mm256_broadcast_ss(va + 5); in conv_im2col_sgemm_sse()
274 _va2 = _mm256_broadcast_ss(va + 6); in conv_im2col_sgemm_sse()
275 _va3 = _mm256_broadcast_ss(va + 7); in conv_im2col_sgemm_sse()
281 va += 8; in conv_im2col_sgemm_sse()
284 _va0 = _mm256_broadcast_ss(va); in conv_im2col_sgemm_sse()
285 _va1 = _mm256_broadcast_ss(va + 1); in conv_im2col_sgemm_sse()
286 _va2 = _mm256_broadcast_ss(va + 2); in conv_im2col_sgemm_sse()
287 _va3 = _mm256_broadcast_ss(va + 3); in conv_im2col_sgemm_sse()
292 _va0 = _mm256_broadcast_ss(va + 4); in conv_im2col_sgemm_sse()
293 _va1 = _mm256_broadcast_ss(va + 5); in conv_im2col_sgemm_sse()
294 _va2 = _mm256_broadcast_ss(va + 6); in conv_im2col_sgemm_sse()
295 _va3 = _mm256_broadcast_ss(va + 7); in conv_im2col_sgemm_sse()
301 va += 8; in conv_im2col_sgemm_sse()
304 _va0 = _mm256_broadcast_ss(va); in conv_im2col_sgemm_sse()
305 _va1 = _mm256_broadcast_ss(va + 1); in conv_im2col_sgemm_sse()
306 _va2 = _mm256_broadcast_ss(va + 2); in conv_im2col_sgemm_sse()
307 _va3 = _mm256_broadcast_ss(va + 3); in conv_im2col_sgemm_sse()
312 _va0 = _mm256_broadcast_ss(va + 4); in conv_im2col_sgemm_sse()
313 _va1 = _mm256_broadcast_ss(va + 5); in conv_im2col_sgemm_sse()
314 _va2 = _mm256_broadcast_ss(va + 6); in conv_im2col_sgemm_sse()
315 _va3 = _mm256_broadcast_ss(va + 7); in conv_im2col_sgemm_sse()
321 va += 8; in conv_im2col_sgemm_sse()
324 _va0 = _mm256_broadcast_ss(va); in conv_im2col_sgemm_sse()
325 _va1 = _mm256_broadcast_ss(va + 1); in conv_im2col_sgemm_sse()
326 _va2 = _mm256_broadcast_ss(va + 2); in conv_im2col_sgemm_sse()
327 _va3 = _mm256_broadcast_ss(va + 3); in conv_im2col_sgemm_sse()
332 _va0 = _mm256_broadcast_ss(va + 4); in conv_im2col_sgemm_sse()
333 _va1 = _mm256_broadcast_ss(va + 5); in conv_im2col_sgemm_sse()
334 _va2 = _mm256_broadcast_ss(va + 6); in conv_im2col_sgemm_sse()
335 _va3 = _mm256_broadcast_ss(va + 7); in conv_im2col_sgemm_sse()
341 va += 8; in conv_im2col_sgemm_sse()
348 __m256 _va0 = _mm256_broadcast_ss(va); in conv_im2col_sgemm_sse()
349 __m256 _va1 = _mm256_broadcast_ss(va + 1); in conv_im2col_sgemm_sse()
350 __m256 _va2 = _mm256_broadcast_ss(va + 2); in conv_im2col_sgemm_sse()
351 __m256 _va3 = _mm256_broadcast_ss(va + 3); in conv_im2col_sgemm_sse()
352 __m256 _va4 = _mm256_broadcast_ss(va + 4); in conv_im2col_sgemm_sse()
353 __m256 _va5 = _mm256_broadcast_ss(va + 5); in conv_im2col_sgemm_sse()
354 __m256 _va6 = _mm256_broadcast_ss(va + 6); in conv_im2col_sgemm_sse()
355 __m256 _va7 = _mm256_broadcast_ss(va + 7); in conv_im2col_sgemm_sse()
366 va += 8; in conv_im2col_sgemm_sse()
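
The references at lines 245-366 above form the vectorized inner loop of an 8-output-channel tile: each of the eight packed kernel values in va is broadcast and multiplied against a __m256 of eight output columns from the im2col buffer, and va steps forward by 8 per input position, so every loaded data vector is reused eight times while the accumulators stay in registers. The sketch below restates that pattern without the 8-way unrolling of the original; micro_kernel_8x8, K, out and ldc are illustrative names, while va and vb mirror the pointers in the listing.

    #include <immintrin.h>

    // 8 output channels x 8 output columns, accumulated over K input positions.
    // va: packed kernel block, 8 floats per step (one per output channel).
    // vb: packed im2col block, 8 floats per step (one per output column).
    static void micro_kernel_8x8(const float* va, const float* vb, int K,
                                 float* out, int ldc)
    {
        __m256 sum[8];
        for (int m = 0; m < 8; m++)
            sum[m] = _mm256_setzero_ps();

        for (int k = 0; k < K; k++)
        {
            __m256 b = _mm256_loadu_ps(vb);                 // 8 output columns
            for (int m = 0; m < 8; m++)
            {
                __m256 a = _mm256_broadcast_ss(va + m);     // kernel value for channel m
                sum[m] = _mm256_add_ps(sum[m], _mm256_mul_ps(a, b));
            }
            va += 8;                                        // the "va += 8" steps above
            vb += 8;
        }

        for (int m = 0; m < 8; m++)
            _mm256_storeu_ps(out + m * ldc, sum[m]);        // one row per output channel
    }
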
393 sum0[n] += va[0] * vb[n]; in conv_im2col_sgemm_sse()
394 sum1[n] += va[1] * vb[n]; in conv_im2col_sgemm_sse()
395 sum2[n] += va[2] * vb[n]; in conv_im2col_sgemm_sse()
396 sum3[n] += va[3] * vb[n]; in conv_im2col_sgemm_sse()
397 sum4[n] += va[4] * vb[n]; in conv_im2col_sgemm_sse()
398 sum5[n] += va[5] * vb[n]; in conv_im2col_sgemm_sse()
399 sum6[n] += va[6] * vb[n]; in conv_im2col_sgemm_sse()
400 sum7[n] += va[7] * vb[n]; in conv_im2col_sgemm_sse()
401 va += 8; in conv_im2col_sgemm_sse()
403 sum0[n] += va[0] * vb[n + 8]; in conv_im2col_sgemm_sse()
404 sum1[n] += va[1] * vb[n + 8]; in conv_im2col_sgemm_sse()
405 sum2[n] += va[2] * vb[n + 8]; in conv_im2col_sgemm_sse()
406 sum3[n] += va[3] * vb[n + 8]; in conv_im2col_sgemm_sse()
407 sum4[n] += va[4] * vb[n + 8]; in conv_im2col_sgemm_sse()
408 sum5[n] += va[5] * vb[n + 8]; in conv_im2col_sgemm_sse()
409 sum6[n] += va[6] * vb[n + 8]; in conv_im2col_sgemm_sse()
410 sum7[n] += va[7] * vb[n + 8]; in conv_im2col_sgemm_sse()
411 va += 8; in conv_im2col_sgemm_sse()
413 sum0[n] += va[0] * vb[n + 16]; in conv_im2col_sgemm_sse()
414 sum1[n] += va[1] * vb[n + 16]; in conv_im2col_sgemm_sse()
415 sum2[n] += va[2] * vb[n + 16]; in conv_im2col_sgemm_sse()
416 sum3[n] += va[3] * vb[n + 16]; in conv_im2col_sgemm_sse()
417 sum4[n] += va[4] * vb[n + 16]; in conv_im2col_sgemm_sse()
418 sum5[n] += va[5] * vb[n + 16]; in conv_im2col_sgemm_sse()
419 sum6[n] += va[6] * vb[n + 16]; in conv_im2col_sgemm_sse()
420 sum7[n] += va[7] * vb[n + 16]; in conv_im2col_sgemm_sse()
421 va += 8; in conv_im2col_sgemm_sse()
423 sum0[n] += va[0] * vb[n + 24]; in conv_im2col_sgemm_sse()
424 sum1[n] += va[1] * vb[n + 24]; in conv_im2col_sgemm_sse()
425 sum2[n] += va[2] * vb[n + 24]; in conv_im2col_sgemm_sse()
426 sum3[n] += va[3] * vb[n + 24]; in conv_im2col_sgemm_sse()
427 sum4[n] += va[4] * vb[n + 24]; in conv_im2col_sgemm_sse()
428 sum5[n] += va[5] * vb[n + 24]; in conv_im2col_sgemm_sse()
429 sum6[n] += va[6] * vb[n + 24]; in conv_im2col_sgemm_sse()
430 sum7[n] += va[7] * vb[n + 24]; in conv_im2col_sgemm_sse()
431 va += 8; in conv_im2col_sgemm_sse()
433 sum0[n] += va[0] * vb[n + 32]; in conv_im2col_sgemm_sse()
434 sum1[n] += va[1] * vb[n + 32]; in conv_im2col_sgemm_sse()
435 sum2[n] += va[2] * vb[n + 32]; in conv_im2col_sgemm_sse()
436 sum3[n] += va[3] * vb[n + 32]; in conv_im2col_sgemm_sse()
437 sum4[n] += va[4] * vb[n + 32]; in conv_im2col_sgemm_sse()
438 sum5[n] += va[5] * vb[n + 32]; in conv_im2col_sgemm_sse()
439 sum6[n] += va[6] * vb[n + 32]; in conv_im2col_sgemm_sse()
440 sum7[n] += va[7] * vb[n + 32]; in conv_im2col_sgemm_sse()
441 va += 8; in conv_im2col_sgemm_sse()
443 sum0[n] += va[0] * vb[n + 40]; in conv_im2col_sgemm_sse()
444 sum1[n] += va[1] * vb[n + 40]; in conv_im2col_sgemm_sse()
445 sum2[n] += va[2] * vb[n + 40]; in conv_im2col_sgemm_sse()
446 sum3[n] += va[3] * vb[n + 40]; in conv_im2col_sgemm_sse()
447 sum4[n] += va[4] * vb[n + 40]; in conv_im2col_sgemm_sse()
448 sum5[n] += va[5] * vb[n + 40]; in conv_im2col_sgemm_sse()
449 sum6[n] += va[6] * vb[n + 40]; in conv_im2col_sgemm_sse()
450 sum7[n] += va[7] * vb[n + 40]; in conv_im2col_sgemm_sse()
451 va += 8; in conv_im2col_sgemm_sse()
453 sum0[n] += va[0] * vb[n + 48]; in conv_im2col_sgemm_sse()
454 sum1[n] += va[1] * vb[n + 48]; in conv_im2col_sgemm_sse()
455 sum2[n] += va[2] * vb[n + 48]; in conv_im2col_sgemm_sse()
456 sum3[n] += va[3] * vb[n + 48]; in conv_im2col_sgemm_sse()
457 sum4[n] += va[4] * vb[n + 48]; in conv_im2col_sgemm_sse()
458 sum5[n] += va[5] * vb[n + 48]; in conv_im2col_sgemm_sse()
459 sum6[n] += va[6] * vb[n + 48]; in conv_im2col_sgemm_sse()
460 sum7[n] += va[7] * vb[n + 48]; in conv_im2col_sgemm_sse()
461 va += 8; in conv_im2col_sgemm_sse()
463 sum0[n] += va[0] * vb[n + 56]; in conv_im2col_sgemm_sse()
464 sum1[n] += va[1] * vb[n + 56]; in conv_im2col_sgemm_sse()
465 sum2[n] += va[2] * vb[n + 56]; in conv_im2col_sgemm_sse()
466 sum3[n] += va[3] * vb[n + 56]; in conv_im2col_sgemm_sse()
467 sum4[n] += va[4] * vb[n + 56]; in conv_im2col_sgemm_sse()
468 sum5[n] += va[5] * vb[n + 56]; in conv_im2col_sgemm_sse()
469 sum6[n] += va[6] * vb[n + 56]; in conv_im2col_sgemm_sse()
470 sum7[n] += va[7] * vb[n + 56]; in conv_im2col_sgemm_sse()
471 va -= 56; in conv_im2col_sgemm_sse()
474 va += 64; in conv_im2col_sgemm_sse()
482 sum0[n] += va[0] * vb[n]; in conv_im2col_sgemm_sse()
483 sum1[n] += va[1] * vb[n]; in conv_im2col_sgemm_sse()
484 sum2[n] += va[2] * vb[n]; in conv_im2col_sgemm_sse()
485 sum3[n] += va[3] * vb[n]; in conv_im2col_sgemm_sse()
486 sum4[n] += va[4] * vb[n]; in conv_im2col_sgemm_sse()
487 sum5[n] += va[5] * vb[n]; in conv_im2col_sgemm_sse()
488 sum6[n] += va[6] * vb[n]; in conv_im2col_sgemm_sse()
489 sum7[n] += va[7] * vb[n]; in conv_im2col_sgemm_sse()
492 va += 8; in conv_im2col_sgemm_sse()
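
Lines 393-492 are the scalar fallback of the same 8-wide tile: sum0..sum7 each accumulate one output channel across the n output columns, the k loop is unrolled by 8 with vb offsets 0 through 56, and the closing "va -= 56; va += 64" simply rewinds the seven in-body "va += 8" bumps and advances past the whole 64-float group, a net +8 per step. A minimal non-unrolled equivalent, with the loop order simplified and hypothetical names:

    // Scalar version of the 8x8 tile; sum[m][n] plays the role of sum0[n]..sum7[n].
    static void micro_kernel_8x8_scalar(const float* va, const float* vb, int K,
                                        float* out, int ldc)
    {
        float sum[8][8] = {};                       // [output channel][output column]

        for (int k = 0; k < K; k++)
        {
            for (int m = 0; m < 8; m++)
                for (int n = 0; n < 8; n++)
                    sum[m][n] += va[m] * vb[n];     // same products as sum0[n] += va[0] * vb[n], ...
            va += 8;                                // net effect of "va += 8" / "va -= 56; va += 64"
            vb += 8;
        }

        for (int m = 0; m < 8; m++)
            for (int n = 0; n < 8; n++)
                out[m * ldc + n] = sum[m][n];
    }
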
521 const float* va = kernel_tm.channel(i / 8); in conv_im2col_sgemm_sse() local
537 __m256 _va0 = _mm256_loadu_ps(va); in conv_im2col_sgemm_sse()
538 __m256 _va1 = _mm256_loadu_ps(va + 8); in conv_im2col_sgemm_sse()
539 __m256 _va2 = _mm256_loadu_ps(va + 16); in conv_im2col_sgemm_sse()
540 __m256 _va3 = _mm256_loadu_ps(va + 24); in conv_im2col_sgemm_sse()
547 va += 32; in conv_im2col_sgemm_sse()
559 __m256 _va = _mm256_loadu_ps(va); in conv_im2col_sgemm_sse()
563 va += 8; in conv_im2col_sgemm_sse()
590 sum0 += va[0] * vb[0]; in conv_im2col_sgemm_sse()
591 sum1 += va[1] * vb[0]; in conv_im2col_sgemm_sse()
592 sum2 += va[2] * vb[0]; in conv_im2col_sgemm_sse()
593 sum3 += va[3] * vb[0]; in conv_im2col_sgemm_sse()
594 sum4 += va[4] * vb[0]; in conv_im2col_sgemm_sse()
595 sum5 += va[5] * vb[0]; in conv_im2col_sgemm_sse()
596 sum6 += va[6] * vb[0]; in conv_im2col_sgemm_sse()
597 sum7 += va[7] * vb[0]; in conv_im2col_sgemm_sse()
599 va += 8; in conv_im2col_sgemm_sse()
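
Lines 521-599 handle leftover output columns of the 8-wide tile one column at a time, so the roles flip: the eight packed kernel values are loaded as a whole vector with _mm256_loadu_ps while the single im2col value is broadcast. A non-unrolled sketch of that shape (function name, out and ldc assumed):

    #include <immintrin.h>

    // One output column, eight output channels.
    static void micro_kernel_8x1(const float* va, const float* vb, int K,
                                 float* out, int ldc)
    {
        __m256 sum = _mm256_setzero_ps();           // one lane per output channel

        for (int k = 0; k < K; k++)
        {
            __m256 a = _mm256_loadu_ps(va);         // 8 kernel values
            __m256 b = _mm256_broadcast_ss(vb + k); // one input value for this k
            sum = _mm256_add_ps(sum, _mm256_mul_ps(a, b));
            va += 8;
        }

        float lanes[8];
        _mm256_storeu_ps(lanes, sum);
        for (int m = 0; m < 8; m++)
            out[m * ldc] = lanes[m];                // scatter to the 8 output channels
    }
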
642 const float* va = kernel_tm.channel(i / 8 + (i % 8) / 4); in conv_im2col_sgemm_sse() local
653 __m256 _va0 = _mm256_broadcast_ss(va); in conv_im2col_sgemm_sse()
654 __m256 _va1 = _mm256_broadcast_ss(va + 1); in conv_im2col_sgemm_sse()
655 __m256 _va2 = _mm256_broadcast_ss(va + 2); in conv_im2col_sgemm_sse()
656 __m256 _va3 = _mm256_broadcast_ss(va + 3); in conv_im2col_sgemm_sse()
666 va += 4; in conv_im2col_sgemm_sse()
669 _va0 = _mm256_broadcast_ss(va); in conv_im2col_sgemm_sse()
670 _va1 = _mm256_broadcast_ss(va + 1); in conv_im2col_sgemm_sse()
671 _va2 = _mm256_broadcast_ss(va + 2); in conv_im2col_sgemm_sse()
672 _va3 = _mm256_broadcast_ss(va + 3); in conv_im2col_sgemm_sse()
678 va += 4; in conv_im2col_sgemm_sse()
681 _va0 = _mm256_broadcast_ss(va); in conv_im2col_sgemm_sse()
682 _va1 = _mm256_broadcast_ss(va + 1); in conv_im2col_sgemm_sse()
683 _va2 = _mm256_broadcast_ss(va + 2); in conv_im2col_sgemm_sse()
684 _va3 = _mm256_broadcast_ss(va + 3); in conv_im2col_sgemm_sse()
690 va += 4; in conv_im2col_sgemm_sse()
693 _va0 = _mm256_broadcast_ss(va); in conv_im2col_sgemm_sse()
694 _va1 = _mm256_broadcast_ss(va + 1); in conv_im2col_sgemm_sse()
695 _va2 = _mm256_broadcast_ss(va + 2); in conv_im2col_sgemm_sse()
696 _va3 = _mm256_broadcast_ss(va + 3); in conv_im2col_sgemm_sse()
702 va += 4; in conv_im2col_sgemm_sse()
709 __m256 _va0 = _mm256_broadcast_ss(va); in conv_im2col_sgemm_sse()
710 __m256 _va1 = _mm256_broadcast_ss(va + 1); in conv_im2col_sgemm_sse()
711 __m256 _va2 = _mm256_broadcast_ss(va + 2); in conv_im2col_sgemm_sse()
712 __m256 _va3 = _mm256_broadcast_ss(va + 3); in conv_im2col_sgemm_sse()
719 va += 4; in conv_im2col_sgemm_sse()
738 sum0[n] += va[0] * vb[n]; in conv_im2col_sgemm_sse()
739 sum1[n] += va[1] * vb[n]; in conv_im2col_sgemm_sse()
740 sum2[n] += va[2] * vb[n]; in conv_im2col_sgemm_sse()
741 sum3[n] += va[3] * vb[n]; in conv_im2col_sgemm_sse()
742 va += 4; in conv_im2col_sgemm_sse()
744 sum0[n] += va[0] * vb[n + 8]; in conv_im2col_sgemm_sse()
745 sum1[n] += va[1] * vb[n + 8]; in conv_im2col_sgemm_sse()
746 sum2[n] += va[2] * vb[n + 8]; in conv_im2col_sgemm_sse()
747 sum3[n] += va[3] * vb[n + 8]; in conv_im2col_sgemm_sse()
748 va += 4; in conv_im2col_sgemm_sse()
750 sum0[n] += va[0] * vb[n + 16]; in conv_im2col_sgemm_sse()
751 sum1[n] += va[1] * vb[n + 16]; in conv_im2col_sgemm_sse()
752 sum2[n] += va[2] * vb[n + 16]; in conv_im2col_sgemm_sse()
753 sum3[n] += va[3] * vb[n + 16]; in conv_im2col_sgemm_sse()
754 va += 4; in conv_im2col_sgemm_sse()
756 sum0[n] += va[0] * vb[n + 24]; in conv_im2col_sgemm_sse()
757 sum1[n] += va[1] * vb[n + 24]; in conv_im2col_sgemm_sse()
758 sum2[n] += va[2] * vb[n + 24]; in conv_im2col_sgemm_sse()
759 sum3[n] += va[3] * vb[n + 24]; in conv_im2col_sgemm_sse()
760 va += 4; in conv_im2col_sgemm_sse()
762 sum0[n] += va[0] * vb[n + 32]; in conv_im2col_sgemm_sse()
763 sum1[n] += va[1] * vb[n + 32]; in conv_im2col_sgemm_sse()
764 sum2[n] += va[2] * vb[n + 32]; in conv_im2col_sgemm_sse()
765 sum3[n] += va[3] * vb[n + 32]; in conv_im2col_sgemm_sse()
766 va += 4; in conv_im2col_sgemm_sse()
768 sum0[n] += va[0] * vb[n + 40]; in conv_im2col_sgemm_sse()
769 sum1[n] += va[1] * vb[n + 40]; in conv_im2col_sgemm_sse()
770 sum2[n] += va[2] * vb[n + 40]; in conv_im2col_sgemm_sse()
771 sum3[n] += va[3] * vb[n + 40]; in conv_im2col_sgemm_sse()
772 va += 4; in conv_im2col_sgemm_sse()
774 sum0[n] += va[0] * vb[n + 48]; in conv_im2col_sgemm_sse()
775 sum1[n] += va[1] * vb[n + 48]; in conv_im2col_sgemm_sse()
776 sum2[n] += va[2] * vb[n + 48]; in conv_im2col_sgemm_sse()
777 sum3[n] += va[3] * vb[n + 48]; in conv_im2col_sgemm_sse()
778 va += 4; in conv_im2col_sgemm_sse()
780 sum0[n] += va[0] * vb[n + 56]; in conv_im2col_sgemm_sse()
781 sum1[n] += va[1] * vb[n + 56]; in conv_im2col_sgemm_sse()
782 sum2[n] += va[2] * vb[n + 56]; in conv_im2col_sgemm_sse()
783 sum3[n] += va[3] * vb[n + 56]; in conv_im2col_sgemm_sse()
784 va -= 28; in conv_im2col_sgemm_sse()
787 va += 32; in conv_im2col_sgemm_sse()
795 sum0[n] += va[0] * vb[n]; in conv_im2col_sgemm_sse()
796 sum1[n] += va[1] * vb[n]; in conv_im2col_sgemm_sse()
797 sum2[n] += va[2] * vb[n]; in conv_im2col_sgemm_sse()
798 sum3[n] += va[3] * vb[n]; in conv_im2col_sgemm_sse()
801 va += 4; in conv_im2col_sgemm_sse()
822 const float* va = kernel_tm.channel(i / 8 + (i % 8) / 4); in conv_im2col_sgemm_sse() local
837 __m128 _va0 = _mm_loadu_ps(va); in conv_im2col_sgemm_sse()
838 __m128 _va1 = _mm_loadu_ps(va + 4); in conv_im2col_sgemm_sse()
839 __m128 _va2 = _mm_loadu_ps(va + 8); in conv_im2col_sgemm_sse()
840 __m128 _va3 = _mm_loadu_ps(va + 12); in conv_im2col_sgemm_sse()
847 va += 16; in conv_im2col_sgemm_sse()
859 __m128 _va = _mm_loadu_ps(va); in conv_im2col_sgemm_sse()
863 va += 4; in conv_im2col_sgemm_sse()
881 sum0 += va[0] * vb[0]; in conv_im2col_sgemm_sse()
882 sum1 += va[1] * vb[0]; in conv_im2col_sgemm_sse()
883 sum2 += va[2] * vb[0]; in conv_im2col_sgemm_sse()
884 sum3 += va[3] * vb[0]; in conv_im2col_sgemm_sse()
886 va += 4; in conv_im2col_sgemm_sse()
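
Lines 642-886 are the 4-output-channel tail tile. The packed kernel block for output channel i now comes from kernel_tm.channel(i / 8 + (i % 8) / 4), one packed channel per full group of 8 output channels plus one for the trailing group of 4, which is why va advances by 4 per input position instead of 8. The broadcast pattern itself is unchanged; a sketch under the same assumptions as before:

    #include <immintrin.h>

    // 4 output channels x 8 output columns; va advances by 4 per input position.
    static void micro_kernel_4x8(const float* va, const float* vb, int K,
                                 float* out, int ldc)
    {
        __m256 sum[4];
        for (int m = 0; m < 4; m++)
            sum[m] = _mm256_setzero_ps();

        for (int k = 0; k < K; k++)
        {
            __m256 b = _mm256_loadu_ps(vb);                 // 8 output columns
            for (int m = 0; m < 4; m++)
            {
                __m256 a = _mm256_broadcast_ss(va + m);     // kernel value for channel m
                sum[m] = _mm256_add_ps(sum[m], _mm256_mul_ps(a, b));
            }
            va += 4;                                        // the "va += 4" steps above
            vb += 8;
        }

        for (int m = 0; m < 4; m++)
            _mm256_storeu_ps(out + m * ldc, sum[m]);
    }
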
915 const float* va = kernel_tm.channel(i / 8 + (i % 8) / 4 + i % 4); in conv_im2col_sgemm_sse() local
923 __m256 _va0 = _mm256_broadcast_ss(va); in conv_im2col_sgemm_sse()
924 __m256 _va1 = _mm256_broadcast_ss(va + 1); in conv_im2col_sgemm_sse()
925 __m256 _va2 = _mm256_broadcast_ss(va + 2); in conv_im2col_sgemm_sse()
926 __m256 _va3 = _mm256_broadcast_ss(va + 3); in conv_im2col_sgemm_sse()
937 va += 4; in conv_im2col_sgemm_sse()
944 __m256 _va0 = _mm256_broadcast_ss(va); in conv_im2col_sgemm_sse()
949 va += 1; in conv_im2col_sgemm_sse()
962 sum[n] += va[0] * vb[n]; in conv_im2col_sgemm_sse()
963 sum[n] += va[1] * vb[n + 8]; in conv_im2col_sgemm_sse()
964 sum[n] += va[2] * vb[n + 16]; in conv_im2col_sgemm_sse()
965 sum[n] += va[3] * vb[n + 24]; in conv_im2col_sgemm_sse()
966 sum[n] += va[4] * vb[n + 32]; in conv_im2col_sgemm_sse()
967 sum[n] += va[5] * vb[n + 40]; in conv_im2col_sgemm_sse()
968 sum[n] += va[6] * vb[n + 48]; in conv_im2col_sgemm_sse()
969 sum[n] += va[7] * vb[n + 56]; in conv_im2col_sgemm_sse()
972 va += 8; in conv_im2col_sgemm_sse()
980 sum[n] += va[0] * vb[n]; in conv_im2col_sgemm_sse()
983 va += 1; in conv_im2col_sgemm_sse()
998 const float* va = kernel_tm.channel(i / 8 + (i % 8) / 4 + i % 4); in conv_im2col_sgemm_sse() local
1009 __m128 _k0 = _mm_loadu_ps(va); in conv_im2col_sgemm_sse()
1010 va += 4; in conv_im2col_sgemm_sse()
1025 sum0 += va[0] * vb[0]; in conv_im2col_sgemm_sse()
1027 va += 1; in conv_im2col_sgemm_sse()
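
Lines 915-1027 handle single leftover output channels, indexed by kernel_tm.channel(i / 8 + (i % 8) / 4 + i % 4). One kernel value at a time is broadcast against eight output columns, matching sum[n] += va[0] * vb[n]; sum[n] += va[1] * vb[n + 8]; and so on. The non-unrolled equivalent (names assumed):

    #include <immintrin.h>

    // 1 output channel x 8 output columns; each kernel value is broadcast and
    // combined with the next k-slice of the packed im2col data.
    static void micro_kernel_1x8(const float* va, const float* vb, int K, float* out)
    {
        __m256 sum = _mm256_setzero_ps();

        for (int k = 0; k < K; k++)
        {
            __m256 a = _mm256_broadcast_ss(va);     // single kernel value
            __m256 b = _mm256_loadu_ps(vb);         // 8 output columns for this k
            sum = _mm256_add_ps(sum, _mm256_mul_ps(a, b));
            va += 1;
            vb += 8;
        }

        _mm256_storeu_ps(out, sum);
    }
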
1217 const float* va = kernel_tm.channel(i / 4); in conv_im2col_sgemm_sse() local
1229 __m128 _va0 = _mm_set1_ps(va[0]); in conv_im2col_sgemm_sse()
1230 __m128 _va1 = _mm_set1_ps(va[1]); in conv_im2col_sgemm_sse()
1231 __m128 _va2 = _mm_set1_ps(va[2]); in conv_im2col_sgemm_sse()
1232 __m128 _va3 = _mm_set1_ps(va[3]); in conv_im2col_sgemm_sse()
1240 _va0 = _mm_set1_ps(va[4]); in conv_im2col_sgemm_sse()
1241 _va1 = _mm_set1_ps(va[5]); in conv_im2col_sgemm_sse()
1242 _va2 = _mm_set1_ps(va[6]); in conv_im2col_sgemm_sse()
1243 _va3 = _mm_set1_ps(va[7]); in conv_im2col_sgemm_sse()
1251 _va0 = _mm_set1_ps(va[8]); in conv_im2col_sgemm_sse()
1252 _va1 = _mm_set1_ps(va[9]); in conv_im2col_sgemm_sse()
1253 _va2 = _mm_set1_ps(va[10]); in conv_im2col_sgemm_sse()
1254 _va3 = _mm_set1_ps(va[11]); in conv_im2col_sgemm_sse()
1262 _va0 = _mm_set1_ps(va[12]); in conv_im2col_sgemm_sse()
1263 _va1 = _mm_set1_ps(va[13]); in conv_im2col_sgemm_sse()
1264 _va2 = _mm_set1_ps(va[14]); in conv_im2col_sgemm_sse()
1265 _va3 = _mm_set1_ps(va[15]); in conv_im2col_sgemm_sse()
1271 va += 16; in conv_im2col_sgemm_sse()
1279 __m128 _va0 = _mm_set1_ps(va[0]); in conv_im2col_sgemm_sse()
1280 __m128 _va1 = _mm_set1_ps(va[1]); in conv_im2col_sgemm_sse()
1281 __m128 _va2 = _mm_set1_ps(va[2]); in conv_im2col_sgemm_sse()
1282 __m128 _va3 = _mm_set1_ps(va[3]); in conv_im2col_sgemm_sse()
1288 va += 4; in conv_im2col_sgemm_sse()
1306 sum0[n] += va[0] * vb[n]; in conv_im2col_sgemm_sse()
1307 sum1[n] += va[1] * vb[n]; in conv_im2col_sgemm_sse()
1308 sum2[n] += va[2] * vb[n]; in conv_im2col_sgemm_sse()
1309 sum3[n] += va[3] * vb[n]; in conv_im2col_sgemm_sse()
1310 va += 4; in conv_im2col_sgemm_sse()
1312 sum0[n] += va[0] * vb[n + 4]; in conv_im2col_sgemm_sse()
1313 sum1[n] += va[1] * vb[n + 4]; in conv_im2col_sgemm_sse()
1314 sum2[n] += va[2] * vb[n + 4]; in conv_im2col_sgemm_sse()
1315 sum3[n] += va[3] * vb[n + 4]; in conv_im2col_sgemm_sse()
1316 va += 4; in conv_im2col_sgemm_sse()
1318 sum0[n] += va[0] * vb[n + 8]; in conv_im2col_sgemm_sse()
1319 sum1[n] += va[1] * vb[n + 8]; in conv_im2col_sgemm_sse()
1320 sum2[n] += va[2] * vb[n + 8]; in conv_im2col_sgemm_sse()
1321 sum3[n] += va[3] * vb[n + 8]; in conv_im2col_sgemm_sse()
1322 va += 4; in conv_im2col_sgemm_sse()
1324 sum0[n] += va[0] * vb[n + 12]; in conv_im2col_sgemm_sse()
1325 sum1[n] += va[1] * vb[n + 12]; in conv_im2col_sgemm_sse()
1326 sum2[n] += va[2] * vb[n + 12]; in conv_im2col_sgemm_sse()
1327 sum3[n] += va[3] * vb[n + 12]; in conv_im2col_sgemm_sse()
1328 va += 4; in conv_im2col_sgemm_sse()
1330 sum0[n] += va[0] * vb[n + 16]; in conv_im2col_sgemm_sse()
1331 sum1[n] += va[1] * vb[n + 16]; in conv_im2col_sgemm_sse()
1332 sum2[n] += va[2] * vb[n + 16]; in conv_im2col_sgemm_sse()
1333 sum3[n] += va[3] * vb[n + 16]; in conv_im2col_sgemm_sse()
1334 va += 4; in conv_im2col_sgemm_sse()
1336 sum0[n] += va[0] * vb[n + 20]; in conv_im2col_sgemm_sse()
1337 sum1[n] += va[1] * vb[n + 20]; in conv_im2col_sgemm_sse()
1338 sum2[n] += va[2] * vb[n + 20]; in conv_im2col_sgemm_sse()
1339 sum3[n] += va[3] * vb[n + 20]; in conv_im2col_sgemm_sse()
1340 va += 4; in conv_im2col_sgemm_sse()
1342 sum0[n] += va[0] * vb[n + 24]; in conv_im2col_sgemm_sse()
1343 sum1[n] += va[1] * vb[n + 24]; in conv_im2col_sgemm_sse()
1344 sum2[n] += va[2] * vb[n + 24]; in conv_im2col_sgemm_sse()
1345 sum3[n] += va[3] * vb[n + 24]; in conv_im2col_sgemm_sse()
1346 va += 4; in conv_im2col_sgemm_sse()
1348 sum0[n] += va[0] * vb[n + 28]; in conv_im2col_sgemm_sse()
1349 sum1[n] += va[1] * vb[n + 28]; in conv_im2col_sgemm_sse()
1350 sum2[n] += va[2] * vb[n + 28]; in conv_im2col_sgemm_sse()
1351 sum3[n] += va[3] * vb[n + 28]; in conv_im2col_sgemm_sse()
1352 va -= 28; in conv_im2col_sgemm_sse()
1355 va += 32; in conv_im2col_sgemm_sse()
1363 sum0[n] += va[0] * vb[n]; in conv_im2col_sgemm_sse()
1364 sum1[n] += va[1] * vb[n]; in conv_im2col_sgemm_sse()
1365 sum2[n] += va[2] * vb[n]; in conv_im2col_sgemm_sse()
1366 sum3[n] += va[3] * vb[n]; in conv_im2col_sgemm_sse()
1369 va += 4; in conv_im2col_sgemm_sse()
1390 const float* va = kernel_tm.channel(i / 4); in conv_im2col_sgemm_sse() local
1405 __m128 _va0 = _mm_loadu_ps(va); in conv_im2col_sgemm_sse()
1406 __m128 _va1 = _mm_loadu_ps(va + 4); in conv_im2col_sgemm_sse()
1407 __m128 _va2 = _mm_loadu_ps(va + 8); in conv_im2col_sgemm_sse()
1408 __m128 _va3 = _mm_loadu_ps(va + 12); in conv_im2col_sgemm_sse()
1415 va += 16; in conv_im2col_sgemm_sse()
1427 __m128 _va = _mm_loadu_ps(va); in conv_im2col_sgemm_sse()
1431 va += 4; in conv_im2col_sgemm_sse()
1449 sum0 += va[0] * vb[0]; in conv_im2col_sgemm_sse()
1450 sum1 += va[1] * vb[0]; in conv_im2col_sgemm_sse()
1451 sum2 += va[2] * vb[0]; in conv_im2col_sgemm_sse()
1452 sum3 += va[3] * vb[0]; in conv_im2col_sgemm_sse()
1454 va += 4; in conv_im2col_sgemm_sse()
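
Lines 1217-1454 are the SSE-only counterpart of the 4-wide tile, indexed by kernel_tm.channel(i / 4): the same broadcast pattern at 128-bit width using _mm_set1_ps, four output columns per register, va advancing by 16 in the unrolled body or 4 per step, and the same "va -= 28; va += 32" pointer bookkeeping (a net +4 per step). A half-width sketch with illustrative names:

    #include <immintrin.h>

    // SSE-width version of the 4-wide tile: 4 output channels x 4 output columns.
    static void micro_kernel_4x4_sse(const float* va, const float* vb, int K,
                                     float* out, int ldc)
    {
        __m128 sum[4];
        for (int m = 0; m < 4; m++)
            sum[m] = _mm_setzero_ps();

        for (int k = 0; k < K; k++)
        {
            __m128 b = _mm_loadu_ps(vb);            // 4 output columns
            for (int m = 0; m < 4; m++)
            {
                __m128 a = _mm_set1_ps(va[m]);      // broadcast, as _mm_set1_ps(va[0]) ... above
                sum[m] = _mm_add_ps(sum[m], _mm_mul_ps(a, b));
            }
            va += 4;
            vb += 4;
        }

        for (int m = 0; m < 4; m++)
            _mm_storeu_ps(out + m * ldc, sum[m]);
    }
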
1481 const float* va = kernel_tm.channel(i / 4 + i % 4); in conv_im2col_sgemm_sse() local
1489 __m128 _va0 = _mm_set1_ps(va[0]); in conv_im2col_sgemm_sse()
1490 __m128 _va1 = _mm_set1_ps(va[1]); in conv_im2col_sgemm_sse()
1491 __m128 _va2 = _mm_set1_ps(va[2]); in conv_im2col_sgemm_sse()
1492 __m128 _va3 = _mm_set1_ps(va[3]); in conv_im2col_sgemm_sse()
1503 va += 4; in conv_im2col_sgemm_sse()
1510 __m128 _va0 = _mm_set1_ps(va[0]); in conv_im2col_sgemm_sse()
1515 va += 1; in conv_im2col_sgemm_sse()
1527 sum[n] += va[0] * vb[n]; in conv_im2col_sgemm_sse()
1528 sum[n] += va[1] * vb[n + 4]; in conv_im2col_sgemm_sse()
1529 sum[n] += va[2] * vb[n + 8]; in conv_im2col_sgemm_sse()
1530 sum[n] += va[3] * vb[n + 12]; in conv_im2col_sgemm_sse()
1537 va += 4; in conv_im2col_sgemm_sse()
1545 sum[n] += va[0] * vb[n]; in conv_im2col_sgemm_sse()
1548 va += 1; in conv_im2col_sgemm_sse()
1563 const float* va = kernel_tm.channel(i / 4 + i % 4); in conv_im2col_sgemm_sse() local
1572 __m128 _k0 = _mm_loadu_ps(va); in conv_im2col_sgemm_sse()
1575 va += 4; in conv_im2col_sgemm_sse()
1586 sum0 += va[0] * vb[0]; in conv_im2col_sgemm_sse()
1588 va += 1; in conv_im2col_sgemm_sse()
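
Lines 1481-1588 finish with the single output channels of the SSE path, indexed by kernel_tm.channel(i / 4 + i % 4); the last block, lines 1563-1588, reduces to a plain dot product over K with an _mm_loadu_ps body and a scalar remainder. A sketch of that final case (function name assumed):

    #include <immintrin.h>

    // Final tail: one output channel, one output column -- a dot product over K,
    // four elements at a time with a scalar remainder.
    static float micro_kernel_1x1_sse(const float* va, const float* vb, int K)
    {
        __m128 vsum = _mm_setzero_ps();
        int k = 0;
        for (; k + 3 < K; k += 4)
        {
            __m128 a = _mm_loadu_ps(va);            // 4 kernel values
            __m128 b = _mm_loadu_ps(vb);            // 4 im2col values
            vsum = _mm_add_ps(vsum, _mm_mul_ps(a, b));
            va += 4;
            vb += 4;
        }

        float lanes[4];
        _mm_storeu_ps(lanes, vsum);
        float sum = lanes[0] + lanes[1] + lanes[2] + lanes[3];

        for (; k < K; k++)                          // remainder, as in "sum0 += va[0] * vb[0]"
        {
            sum += va[0] * vb[0];
            va += 1;
            vb += 1;
        }
        return sum;
    }
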