1 /*
2
3 Copyright (C) 2014, The University of Texas at Austin
4
5 This file is part of libflame and is available under the 3-Clause
6 BSD license, which can be found in the LICENSE file at the top-level
7 directory, or at http://opensource.org/licenses/BSD-3-Clause
8
9 */
10
#include <stddef.h>

#include "blis1.h"
12
// Copies op(A) into the m-by-n int matrix B, where op(A) is A read as-is or
// transposed, as determined by bl1_does_trans( trans ).
//
//   trans      - transposition selector for A; its conjugation component is
//                extracted with bl1_proj_trans1_to_conj() and passed on to
//                bl1_icopyv().
//   m, n       - number of rows and columns of B (and of op(A)).
//   a          - address of the first element of A.
//   a_rs, a_cs - row and column strides of A, in elements.
//   b          - address of the first element of B.
//   b_rs, b_cs - row and column strides of B, in elements.
//
// One bl1_icopyv() call is issued per column of B, or per row of B when B is
// row-major and op(A) is effectively row-major as well. When
// bl1_is_vector( m, n ) holds, a single call copies the whole operand.
void bl1_icopymt( trans1_t trans, int m, int n, int* a, int a_rs, int a_cs, int* b, int b_rs, int b_cs )
{
    int*      a_begin;
    int*      b_begin;
    int       lda, inca;
    int       ldb, incb;
    int       n_iter;
    int       n_elem;
    int       j;

    // Return early if possible.
    if ( bl1_zero_dim2( m, n ) ) return;

    // Handle cases where A and B are vectors to ensure that the underlying copy
    // gets invoked only once.
    if ( bl1_is_vector( m, n ) )
    {
        // Initialize with values appropriate for vectors.
        n_iter = 1;
        n_elem = bl1_vector_dim( m, n );
        lda    = 1; // multiplied by zero when n_iter == 1; not needed.
        inca   = bl1_vector_inc( trans,              m, n, a_rs, a_cs );
        ldb    = 1; // multiplied by zero when n_iter == 1; not needed.
        incb   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
    }
    else // matrix case
    {
        // Initialize with optimal values for column-major storage.
        n_iter = n;
        n_elem = m;
        lda    = a_cs;
        inca   = a_rs;
        ldb    = b_cs;
        incb   = b_rs;

        // Handle the transposition of A.
        if ( bl1_does_trans( trans ) )
        {
            bl1_swap_ints( lda, inca );
        }

        // An optimization: if B is row-major and if A is effectively row-major
        // after a possible transposition, then let's access the matrix by rows
        // instead of by columns for increased spatial locality.
        if ( bl1_is_row_storage( b_rs, b_cs ) )
        {
            if ( ( bl1_is_col_storage( a_rs, a_cs ) && bl1_does_trans( trans ) ) ||
                 ( bl1_is_row_storage( a_rs, a_cs ) && bl1_does_notrans( trans ) ) )
            {
                bl1_swap_ints( n_iter, n_elem );
                bl1_swap_ints( lda, inca );
                bl1_swap_ints( ldb, incb );
            }
        }
    }

    for ( j = 0; j < n_iter; j++ )
    {
        // Compute the offsets in ptrdiff_t; the int products j*lda and j*ldb
        // would overflow once an operand spans more than INT_MAX elements.
        a_begin = a + ( ptrdiff_t )j * lda;
        b_begin = b + ( ptrdiff_t )j * ldb;

        bl1_icopyv( bl1_proj_trans1_to_conj( trans ),
                    n_elem,
                    a_begin, inca,
                    b_begin, incb );
    }
}
80
// Copies op(A) into the m-by-n float matrix B, where op(A) is A read as-is or
// transposed, as determined by bl1_does_trans( trans ).
//
//   trans      - transposition selector for A; only its transposition
//                component is consulted by this routine.
//   m, n       - number of rows and columns of B (and of op(A)).
//   a          - address of the first element of A.
//   a_rs, a_cs - row and column strides of A, in elements.
//   b          - address of the first element of B.
//   b_rs, b_cs - row and column strides of B, in elements.
//
// One bl1_scopy() call is issued per column of B, or per row of B when B is
// row-major and op(A) is effectively row-major as well. When
// bl1_is_vector( m, n ) holds, a single call copies the whole operand.
void bl1_scopymt( trans1_t trans, int m, int n, float* a, int a_rs, int a_cs, float* b, int b_rs, int b_cs )
{
    float*    a_begin;
    float*    b_begin;
    int       lda, inca;
    int       ldb, incb;
    int       n_iter;
    int       n_elem;
    int       j;

    // Return early if possible.
    if ( bl1_zero_dim2( m, n ) ) return;

    // Handle cases where A and B are vectors to ensure that the underlying copy
    // gets invoked only once.
    if ( bl1_is_vector( m, n ) )
    {
        // Initialize with values appropriate for vectors.
        n_iter = 1;
        n_elem = bl1_vector_dim( m, n );
        lda    = 1; // multiplied by zero when n_iter == 1; not needed.
        inca   = bl1_vector_inc( trans,              m, n, a_rs, a_cs );
        ldb    = 1; // multiplied by zero when n_iter == 1; not needed.
        incb   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
    }
    else // matrix case
    {
        // Initialize with optimal values for column-major storage.
        n_iter = n;
        n_elem = m;
        lda    = a_cs;
        inca   = a_rs;
        ldb    = b_cs;
        incb   = b_rs;

        // Handle the transposition of A.
        if ( bl1_does_trans( trans ) )
        {
            bl1_swap_ints( lda, inca );
        }

        // An optimization: if B is row-major and if A is effectively row-major
        // after a possible transposition, then let's access the matrix by rows
        // instead of by columns for increased spatial locality.
        if ( bl1_is_row_storage( b_rs, b_cs ) )
        {
            if ( ( bl1_is_col_storage( a_rs, a_cs ) && bl1_does_trans( trans ) ) ||
                 ( bl1_is_row_storage( a_rs, a_cs ) && bl1_does_notrans( trans ) ) )
            {
                bl1_swap_ints( n_iter, n_elem );
                bl1_swap_ints( lda, inca );
                bl1_swap_ints( ldb, incb );
            }
        }
    }

    for ( j = 0; j < n_iter; j++ )
    {
        // Compute the offsets in ptrdiff_t; the int products j*lda and j*ldb
        // would overflow once an operand spans more than INT_MAX elements.
        a_begin = a + ( ptrdiff_t )j * lda;
        b_begin = b + ( ptrdiff_t )j * ldb;

        bl1_scopy( n_elem,
                   a_begin, inca,
                   b_begin, incb );
    }
}
147
// Copies op(A) into the m-by-n double matrix B, where op(A) is A read as-is
// or transposed, as determined by bl1_does_trans( trans ).
//
//   trans      - transposition selector for A; only its transposition
//                component is consulted by this routine.
//   m, n       - number of rows and columns of B (and of op(A)).
//   a          - address of the first element of A.
//   a_rs, a_cs - row and column strides of A, in elements.
//   b          - address of the first element of B.
//   b_rs, b_cs - row and column strides of B, in elements.
//
// One bl1_dcopy() call is issued per column of B, or per row of B when B is
// row-major and op(A) is effectively row-major as well. When
// bl1_is_vector( m, n ) holds, a single call copies the whole operand.
void bl1_dcopymt( trans1_t trans, int m, int n, double* a, int a_rs, int a_cs, double* b, int b_rs, int b_cs )
{
    double*   a_begin;
    double*   b_begin;
    int       lda, inca;
    int       ldb, incb;
    int       n_iter;
    int       n_elem;
    int       j;

    // Return early if possible.
    if ( bl1_zero_dim2( m, n ) ) return;

    // Handle cases where A and B are vectors to ensure that the underlying copy
    // gets invoked only once.
    if ( bl1_is_vector( m, n ) )
    {
        // Initialize with values appropriate for vectors.
        n_iter = 1;
        n_elem = bl1_vector_dim( m, n );
        lda    = 1; // multiplied by zero when n_iter == 1; not needed.
        inca   = bl1_vector_inc( trans,              m, n, a_rs, a_cs );
        ldb    = 1; // multiplied by zero when n_iter == 1; not needed.
        incb   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
    }
    else // matrix case
    {
        // Initialize with optimal values for column-major storage.
        n_iter = n;
        n_elem = m;
        lda    = a_cs;
        inca   = a_rs;
        ldb    = b_cs;
        incb   = b_rs;

        // Handle the transposition of A.
        if ( bl1_does_trans( trans ) )
        {
            bl1_swap_ints( lda, inca );
        }

        // An optimization: if B is row-major and if A is effectively row-major
        // after a possible transposition, then let's access the matrix by rows
        // instead of by columns for increased spatial locality.
        if ( bl1_is_row_storage( b_rs, b_cs ) )
        {
            if ( ( bl1_is_col_storage( a_rs, a_cs ) && bl1_does_trans( trans ) ) ||
                 ( bl1_is_row_storage( a_rs, a_cs ) && bl1_does_notrans( trans ) ) )
            {
                bl1_swap_ints( n_iter, n_elem );
                bl1_swap_ints( lda, inca );
                bl1_swap_ints( ldb, incb );
            }
        }
    }

    for ( j = 0; j < n_iter; j++ )
    {
        // Compute the offsets in ptrdiff_t; the int products j*lda and j*ldb
        // would overflow once an operand spans more than INT_MAX elements.
        a_begin = a + ( ptrdiff_t )j * lda;
        b_begin = b + ( ptrdiff_t )j * ldb;

        bl1_dcopy( n_elem,
                   a_begin, inca,
                   b_begin, incb );
    }
}
214
// Copies op(A) into the m-by-n scomplex matrix B, where op(A) is A read as-is
// or transposed, as determined by bl1_does_trans( trans ). When
// bl1_does_conj( trans ) holds, bl1_cconjv() is additionally applied to each
// column (or row) of B right after it has been written.
//
//   trans      - transposition/conjugation selector for A.
//   m, n       - number of rows and columns of B (and of op(A)).
//   a          - address of the first element of A.
//   a_rs, a_cs - row and column strides of A, in elements.
//   b          - address of the first element of B.
//   b_rs, b_cs - row and column strides of B, in elements.
//
// One bl1_ccopy() call is issued per column of B, or per row of B when B is
// row-major and op(A) is effectively row-major as well. When
// bl1_is_vector( m, n ) holds, a single call copies the whole operand.
void bl1_ccopymt( trans1_t trans, int m, int n, scomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs )
{
    scomplex* a_begin;
    scomplex* b_begin;
    int       lda, inca;
    int       ldb, incb;
    int       n_iter;
    int       n_elem;
    int       j;

    // Return early if possible.
    if ( bl1_zero_dim2( m, n ) ) return;

    // Handle cases where A and B are vectors to ensure that the underlying copy
    // gets invoked only once.
    if ( bl1_is_vector( m, n ) )
    {
        // Initialize with values appropriate for vectors.
        n_iter = 1;
        n_elem = bl1_vector_dim( m, n );
        lda    = 1; // multiplied by zero when n_iter == 1; not needed.
        inca   = bl1_vector_inc( trans,              m, n, a_rs, a_cs );
        ldb    = 1; // multiplied by zero when n_iter == 1; not needed.
        incb   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
    }
    else // matrix case
    {
        // Initialize with optimal values for column-major storage.
        n_iter = n;
        n_elem = m;
        lda    = a_cs;
        inca   = a_rs;
        ldb    = b_cs;
        incb   = b_rs;

        // Handle the transposition of A.
        if ( bl1_does_trans( trans ) )
        {
            bl1_swap_ints( lda, inca );
        }

        // An optimization: if B is row-major and if A is effectively row-major
        // after a possible transposition, then let's access the matrix by rows
        // instead of by columns for increased spatial locality.
        if ( bl1_is_row_storage( b_rs, b_cs ) )
        {
            if ( ( bl1_is_col_storage( a_rs, a_cs ) && bl1_does_trans( trans ) ) ||
                 ( bl1_is_row_storage( a_rs, a_cs ) && bl1_does_notrans( trans ) ) )
            {
                bl1_swap_ints( n_iter, n_elem );
                bl1_swap_ints( lda, inca );
                bl1_swap_ints( ldb, incb );
            }
        }
    }

    for ( j = 0; j < n_iter; j++ )
    {
        // Compute the offsets in ptrdiff_t; the int products j*lda and j*ldb
        // would overflow once an operand spans more than INT_MAX elements.
        a_begin = a + ( ptrdiff_t )j * lda;
        b_begin = b + ( ptrdiff_t )j * ldb;

        bl1_ccopy( n_elem,
                   a_begin, inca,
                   b_begin, incb );

        // The conjugation requested through trans is applied to B, after the
        // copy, rather than to A.
        if ( bl1_does_conj( trans ) )
        {
            bl1_cconjv( n_elem,
                        b_begin, incb );
        }
    }
}
285
// Copies op(A) into the m-by-n dcomplex matrix B, where op(A) is A read as-is
// or transposed, as determined by bl1_does_trans( trans ). When
// bl1_does_conj( trans ) holds, bl1_zconjv() is additionally applied to each
// column (or row) of B right after it has been written.
//
//   trans      - transposition/conjugation selector for A.
//   m, n       - number of rows and columns of B (and of op(A)).
//   a          - address of the first element of A.
//   a_rs, a_cs - row and column strides of A, in elements.
//   b          - address of the first element of B.
//   b_rs, b_cs - row and column strides of B, in elements.
//
// One bl1_zcopy() call is issued per column of B, or per row of B when B is
// row-major and op(A) is effectively row-major as well. When
// bl1_is_vector( m, n ) holds, a single call copies the whole operand.
void bl1_zcopymt( trans1_t trans, int m, int n, dcomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs )
{
    dcomplex* a_begin;
    dcomplex* b_begin;
    int       lda, inca;
    int       ldb, incb;
    int       n_iter;
    int       n_elem;
    int       j;

    // Return early if possible.
    if ( bl1_zero_dim2( m, n ) ) return;

    // Handle cases where A and B are vectors to ensure that the underlying copy
    // gets invoked only once.
    if ( bl1_is_vector( m, n ) )
    {
        // Initialize with values appropriate for vectors.
        n_iter = 1;
        n_elem = bl1_vector_dim( m, n );
        lda    = 1; // multiplied by zero when n_iter == 1; not needed.
        inca   = bl1_vector_inc( trans,              m, n, a_rs, a_cs );
        ldb    = 1; // multiplied by zero when n_iter == 1; not needed.
        incb   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
    }
    else // matrix case
    {
        // Initialize with optimal values for column-major storage.
        n_iter = n;
        n_elem = m;
        lda    = a_cs;
        inca   = a_rs;
        ldb    = b_cs;
        incb   = b_rs;

        // Handle the transposition of A.
        if ( bl1_does_trans( trans ) )
        {
            bl1_swap_ints( lda, inca );
        }

        // An optimization: if B is row-major and if A is effectively row-major
        // after a possible transposition, then let's access the matrix by rows
        // instead of by columns for increased spatial locality.
        if ( bl1_is_row_storage( b_rs, b_cs ) )
        {
            if ( ( bl1_is_col_storage( a_rs, a_cs ) && bl1_does_trans( trans ) ) ||
                 ( bl1_is_row_storage( a_rs, a_cs ) && bl1_does_notrans( trans ) ) )
            {
                bl1_swap_ints( n_iter, n_elem );
                bl1_swap_ints( lda, inca );
                bl1_swap_ints( ldb, incb );
            }
        }
    }

    for ( j = 0; j < n_iter; j++ )
    {
        // Compute the offsets in ptrdiff_t; the int products j*lda and j*ldb
        // would overflow once an operand spans more than INT_MAX elements.
        a_begin = a + ( ptrdiff_t )j * lda;
        b_begin = b + ( ptrdiff_t )j * ldb;

        bl1_zcopy( n_elem,
                   a_begin, inca,
                   b_begin, incb );

        // The conjugation requested through trans is applied to B, after the
        // copy, rather than to A.
        if ( bl1_does_conj( trans ) )
        {
            bl1_zconjv( n_elem,
                        b_begin, incb );
        }
    }
}
356
357 // --- Mixed-datatype and general stride copy routines---------------
358
359 // ss
// Copies op(A), a float matrix, into the m-by-n float matrix B, where op(A)
// is A read as-is or transposed, as determined by bl1_does_trans( trans ).
// The element-wise copy is delegated to bl1_scopyv().
//
//   trans      - transposition selector for A; its conjugation component is
//                extracted with bl1_proj_trans1_to_conj() and passed on to
//                bl1_scopyv().
//   m, n       - number of rows and columns of B (and of op(A)).
//   a          - address of the first element of A.
//   a_rs, a_cs - row and column strides of A, in elements.
//   b          - address of the first element of B.
//   b_rs, b_cs - row and column strides of B, in elements.
//
// One bl1_scopyv() call is issued per column of B, or per row of B when B is
// row-major. When bl1_is_vector( m, n ) holds, a single call copies the whole
// operand.
void bl1_sscopymt( trans1_t trans, int m, int n, float* a, int a_rs, int a_cs, float* b, int b_rs, int b_cs )
{
    float*    a_begin;
    float*    b_begin;
    int       lda, inca;
    int       ldb, incb;
    int       n_iter;
    int       n_elem;
    int       j;
    conj1_t   conj;

    // Return early if possible.
    if ( bl1_zero_dim2( m, n ) ) return;

    // Handle cases where A and B are vectors to ensure that the underlying copy
    // gets invoked only once.
    if ( bl1_is_vector( m, n ) )
    {
        // Initialize with values appropriate for vectors.
        n_iter = 1;
        n_elem = bl1_vector_dim( m, n );
        lda    = 1; // multiplied by zero when n_iter == 1; not needed.
        inca   = bl1_vector_inc( trans,              m, n, a_rs, a_cs );
        ldb    = 1; // multiplied by zero when n_iter == 1; not needed.
        incb   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
    }
    else // matrix case
    {
        // Initialize with optimal values for column-major storage of B.
        n_iter = n;
        n_elem = m;
        lda    = a_cs;
        inca   = a_rs;
        ldb    = b_cs;
        incb   = b_rs;

        // Handle the transposition of A.
        if ( bl1_does_trans( trans ) )
        {
            bl1_swap_ints( lda, inca );
        }

        // An optimization: if B is row-major, then let's access the matrix by rows
        // instead of by columns for increased spatial locality.
        if ( bl1_is_row_storage( b_rs, b_cs ) )
        {
            bl1_swap_ints( n_iter, n_elem );
            bl1_swap_ints( lda, inca );
            bl1_swap_ints( ldb, incb );
        }
    }

    // Extract conj component from trans parameter.
    conj = bl1_proj_trans1_to_conj( trans );

    for ( j = 0; j < n_iter; ++j )
    {
        // Compute the offsets in ptrdiff_t; the int products j*lda and j*ldb
        // would overflow once an operand spans more than INT_MAX elements.
        a_begin = a + ( ptrdiff_t )j * lda;
        b_begin = b + ( ptrdiff_t )j * ldb;

        bl1_scopyv( conj,
                    n_elem,
                    a_begin, inca,
                    b_begin, incb );
    }
}
426
427 // sd ds
// Copies op(A), a float matrix, into the m-by-n double matrix B, where op(A)
// is A read as-is or transposed, as determined by bl1_does_trans( trans ).
// The element-wise copy, including the change of datatype, is delegated to
// bl1_sdcopyv().
//
//   trans      - transposition selector for A; its conjugation component is
//                extracted with bl1_proj_trans1_to_conj() and passed on to
//                bl1_sdcopyv().
//   m, n       - number of rows and columns of B (and of op(A)).
//   a          - address of the first element of A.
//   a_rs, a_cs - row and column strides of A, in elements.
//   b          - address of the first element of B.
//   b_rs, b_cs - row and column strides of B, in elements.
//
// One bl1_sdcopyv() call is issued per column of B, or per row of B when B is
// row-major. When bl1_is_vector( m, n ) holds, a single call copies the whole
// operand.
void bl1_sdcopymt( trans1_t trans, int m, int n, float* a, int a_rs, int a_cs, double* b, int b_rs, int b_cs )
{
    float*    a_begin;
    double*   b_begin;
    int       lda, inca;
    int       ldb, incb;
    int       n_iter;
    int       n_elem;
    int       j;
    conj1_t   conj;

    // Return early if possible.
    if ( bl1_zero_dim2( m, n ) ) return;

    // Handle cases where A and B are vectors to ensure that the underlying copy
    // gets invoked only once.
    if ( bl1_is_vector( m, n ) )
    {
        // Initialize with values appropriate for vectors.
        n_iter = 1;
        n_elem = bl1_vector_dim( m, n );
        lda    = 1; // multiplied by zero when n_iter == 1; not needed.
        inca   = bl1_vector_inc( trans,              m, n, a_rs, a_cs );
        ldb    = 1; // multiplied by zero when n_iter == 1; not needed.
        incb   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
    }
    else // matrix case
    {
        // Initialize with optimal values for column-major storage of B.
        n_iter = n;
        n_elem = m;
        lda    = a_cs;
        inca   = a_rs;
        ldb    = b_cs;
        incb   = b_rs;

        // Handle the transposition of A.
        if ( bl1_does_trans( trans ) )
        {
            bl1_swap_ints( lda, inca );
        }

        // An optimization: if B is row-major, then let's access the matrix by rows
        // instead of by columns for increased spatial locality.
        if ( bl1_is_row_storage( b_rs, b_cs ) )
        {
            bl1_swap_ints( n_iter, n_elem );
            bl1_swap_ints( lda, inca );
            bl1_swap_ints( ldb, incb );
        }
    }

    // Extract conj component from trans parameter.
    conj = bl1_proj_trans1_to_conj( trans );

    for ( j = 0; j < n_iter; ++j )
    {
        // Compute the offsets in ptrdiff_t; the int products j*lda and j*ldb
        // would overflow once an operand spans more than INT_MAX elements.
        a_begin = a + ( ptrdiff_t )j * lda;
        b_begin = b + ( ptrdiff_t )j * ldb;

        bl1_sdcopyv( conj,
                     n_elem,
                     a_begin, inca,
                     b_begin, incb );
    }
}
// Copies op(A), a double matrix, into the m-by-n float matrix B, where op(A)
// is A read as-is or transposed, as determined by bl1_does_trans( trans ).
// The element-wise copy, including the change of datatype, is delegated to
// bl1_dscopyv().
//
//   trans      - transposition selector for A; its conjugation component is
//                extracted with bl1_proj_trans1_to_conj() and passed on to
//                bl1_dscopyv().
//   m, n       - number of rows and columns of B (and of op(A)).
//   a          - address of the first element of A.
//   a_rs, a_cs - row and column strides of A, in elements.
//   b          - address of the first element of B.
//   b_rs, b_cs - row and column strides of B, in elements.
//
// One bl1_dscopyv() call is issued per column of B, or per row of B when B is
// row-major. When bl1_is_vector( m, n ) holds, a single call copies the whole
// operand.
void bl1_dscopymt( trans1_t trans, int m, int n, double* a, int a_rs, int a_cs, float* b, int b_rs, int b_cs )
{
    double*   a_begin;
    float*    b_begin;
    int       lda, inca;
    int       ldb, incb;
    int       n_iter;
    int       n_elem;
    int       j;
    conj1_t   conj;

    // Return early if possible.
    if ( bl1_zero_dim2( m, n ) ) return;

    // Handle cases where A and B are vectors to ensure that the underlying copy
    // gets invoked only once.
    if ( bl1_is_vector( m, n ) )
    {
        // Initialize with values appropriate for vectors.
        n_iter = 1;
        n_elem = bl1_vector_dim( m, n );
        lda    = 1; // multiplied by zero when n_iter == 1; not needed.
        inca   = bl1_vector_inc( trans,              m, n, a_rs, a_cs );
        ldb    = 1; // multiplied by zero when n_iter == 1; not needed.
        incb   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
    }
    else // matrix case
    {
        // Initialize with optimal values for column-major storage of B.
        n_iter = n;
        n_elem = m;
        lda    = a_cs;
        inca   = a_rs;
        ldb    = b_cs;
        incb   = b_rs;

        // Handle the transposition of A.
        if ( bl1_does_trans( trans ) )
        {
            bl1_swap_ints( lda, inca );
        }

        // An optimization: if B is row-major, then let's access the matrix by rows
        // instead of by columns for increased spatial locality.
        if ( bl1_is_row_storage( b_rs, b_cs ) )
        {
            bl1_swap_ints( n_iter, n_elem );
            bl1_swap_ints( lda, inca );
            bl1_swap_ints( ldb, incb );
        }
    }

    // Extract conj component from trans parameter.
    conj = bl1_proj_trans1_to_conj( trans );

    for ( j = 0; j < n_iter; ++j )
    {
        // Compute the offsets in ptrdiff_t; the int products j*lda and j*ldb
        // would overflow once an operand spans more than INT_MAX elements.
        a_begin = a + ( ptrdiff_t )j * lda;
        b_begin = b + ( ptrdiff_t )j * ldb;

        bl1_dscopyv( conj,
                     n_elem,
                     a_begin, inca,
                     b_begin, incb );
    }
}
560
561 // sc cs
// Copies op(A), a float matrix, into the m-by-n scomplex matrix B, where
// op(A) is A read as-is or transposed, as determined by
// bl1_does_trans( trans ). The element-wise copy, including the change of
// datatype, is delegated to bl1_sccopyv().
//
//   trans      - transposition selector for A; its conjugation component is
//                extracted with bl1_proj_trans1_to_conj() and passed on to
//                bl1_sccopyv().
//   m, n       - number of rows and columns of B (and of op(A)).
//   a          - address of the first element of A.
//   a_rs, a_cs - row and column strides of A, in elements.
//   b          - address of the first element of B.
//   b_rs, b_cs - row and column strides of B, in elements.
//
// One bl1_sccopyv() call is issued per column of B, or per row of B when B is
// row-major. When bl1_is_vector( m, n ) holds, a single call copies the whole
// operand.
void bl1_sccopymt( trans1_t trans, int m, int n, float* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs )
{
    float*    a_begin;
    scomplex* b_begin;
    int       lda, inca;
    int       ldb, incb;
    int       n_iter;
    int       n_elem;
    int       j;
    conj1_t   conj;

    // Return early if possible.
    if ( bl1_zero_dim2( m, n ) ) return;

    // Handle cases where A and B are vectors to ensure that the underlying copy
    // gets invoked only once.
    if ( bl1_is_vector( m, n ) )
    {
        // Initialize with values appropriate for vectors.
        n_iter = 1;
        n_elem = bl1_vector_dim( m, n );
        lda    = 1; // multiplied by zero when n_iter == 1; not needed.
        inca   = bl1_vector_inc( trans,              m, n, a_rs, a_cs );
        ldb    = 1; // multiplied by zero when n_iter == 1; not needed.
        incb   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
    }
    else // matrix case
    {
        // Initialize with optimal values for column-major storage of B.
        n_iter = n;
        n_elem = m;
        lda    = a_cs;
        inca   = a_rs;
        ldb    = b_cs;
        incb   = b_rs;

        // Handle the transposition of A.
        if ( bl1_does_trans( trans ) )
        {
            bl1_swap_ints( lda, inca );
        }

        // An optimization: if B is row-major, then let's access the matrix by rows
        // instead of by columns for increased spatial locality.
        if ( bl1_is_row_storage( b_rs, b_cs ) )
        {
            bl1_swap_ints( n_iter, n_elem );
            bl1_swap_ints( lda, inca );
            bl1_swap_ints( ldb, incb );
        }
    }

    // Extract conj component from trans parameter.
    conj = bl1_proj_trans1_to_conj( trans );

    for ( j = 0; j < n_iter; ++j )
    {
        // Compute the offsets in ptrdiff_t; the int products j*lda and j*ldb
        // would overflow once an operand spans more than INT_MAX elements.
        a_begin = a + ( ptrdiff_t )j * lda;
        b_begin = b + ( ptrdiff_t )j * ldb;

        bl1_sccopyv( conj,
                     n_elem,
                     a_begin, inca,
                     b_begin, incb );
    }
}
// Copies op(A), an scomplex matrix, into the m-by-n float matrix B, where
// op(A) is A read as-is or transposed, as determined by
// bl1_does_trans( trans ). The element-wise copy, including how each complex
// element is reduced to a float, is delegated to bl1_cscopyv().
//
//   trans      - transposition selector for A; its conjugation component is
//                extracted with bl1_proj_trans1_to_conj() and passed on to
//                bl1_cscopyv().
//   m, n       - number of rows and columns of B (and of op(A)).
//   a          - address of the first element of A.
//   a_rs, a_cs - row and column strides of A, in elements.
//   b          - address of the first element of B.
//   b_rs, b_cs - row and column strides of B, in elements.
//
// One bl1_cscopyv() call is issued per column of B, or per row of B when B is
// row-major. When bl1_is_vector( m, n ) holds, a single call copies the whole
// operand.
void bl1_cscopymt( trans1_t trans, int m, int n, scomplex* a, int a_rs, int a_cs, float* b, int b_rs, int b_cs )
{
    scomplex* a_begin;
    float*    b_begin;
    int       lda, inca;
    int       ldb, incb;
    int       n_iter;
    int       n_elem;
    int       j;
    conj1_t   conj;

    // Return early if possible.
    if ( bl1_zero_dim2( m, n ) ) return;

    // Handle cases where A and B are vectors to ensure that the underlying copy
    // gets invoked only once.
    if ( bl1_is_vector( m, n ) )
    {
        // Initialize with values appropriate for vectors.
        n_iter = 1;
        n_elem = bl1_vector_dim( m, n );
        lda    = 1; // multiplied by zero when n_iter == 1; not needed.
        inca   = bl1_vector_inc( trans,              m, n, a_rs, a_cs );
        ldb    = 1; // multiplied by zero when n_iter == 1; not needed.
        incb   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
    }
    else // matrix case
    {
        // Initialize with optimal values for column-major storage of B.
        n_iter = n;
        n_elem = m;
        lda    = a_cs;
        inca   = a_rs;
        ldb    = b_cs;
        incb   = b_rs;

        // Handle the transposition of A.
        if ( bl1_does_trans( trans ) )
        {
            bl1_swap_ints( lda, inca );
        }

        // An optimization: if B is row-major, then let's access the matrix by rows
        // instead of by columns for increased spatial locality.
        if ( bl1_is_row_storage( b_rs, b_cs ) )
        {
            bl1_swap_ints( n_iter, n_elem );
            bl1_swap_ints( lda, inca );
            bl1_swap_ints( ldb, incb );
        }
    }

    // Extract conj component from trans parameter.
    conj = bl1_proj_trans1_to_conj( trans );

    for ( j = 0; j < n_iter; ++j )
    {
        // Compute the offsets in ptrdiff_t; the int products j*lda and j*ldb
        // would overflow once an operand spans more than INT_MAX elements.
        a_begin = a + ( ptrdiff_t )j * lda;
        b_begin = b + ( ptrdiff_t )j * ldb;

        bl1_cscopyv( conj,
                     n_elem,
                     a_begin, inca,
                     b_begin, incb );
    }
}
694
695 // sz zs
// Copies op(A), a float matrix, into the m-by-n dcomplex matrix B, where
// op(A) is A read as-is or transposed, as determined by
// bl1_does_trans( trans ). The element-wise copy, including the change of
// datatype, is delegated to bl1_szcopyv().
//
//   trans      - transposition selector for A; its conjugation component is
//                extracted with bl1_proj_trans1_to_conj() and passed on to
//                bl1_szcopyv().
//   m, n       - number of rows and columns of B (and of op(A)).
//   a          - address of the first element of A.
//   a_rs, a_cs - row and column strides of A, in elements.
//   b          - address of the first element of B.
//   b_rs, b_cs - row and column strides of B, in elements.
//
// One bl1_szcopyv() call is issued per column of B, or per row of B when B is
// row-major. When bl1_is_vector( m, n ) holds, a single call copies the whole
// operand.
void bl1_szcopymt( trans1_t trans, int m, int n, float* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs )
{
    float*    a_begin;
    dcomplex* b_begin;
    int       lda, inca;
    int       ldb, incb;
    int       n_iter;
    int       n_elem;
    int       j;
    conj1_t   conj;

    // Return early if possible.
    if ( bl1_zero_dim2( m, n ) ) return;

    // Handle cases where A and B are vectors to ensure that the underlying copy
    // gets invoked only once.
    if ( bl1_is_vector( m, n ) )
    {
        // Initialize with values appropriate for vectors.
        n_iter = 1;
        n_elem = bl1_vector_dim( m, n );
        lda    = 1; // multiplied by zero when n_iter == 1; not needed.
        inca   = bl1_vector_inc( trans,              m, n, a_rs, a_cs );
        ldb    = 1; // multiplied by zero when n_iter == 1; not needed.
        incb   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
    }
    else // matrix case
    {
        // Initialize with optimal values for column-major storage of B.
        n_iter = n;
        n_elem = m;
        lda    = a_cs;
        inca   = a_rs;
        ldb    = b_cs;
        incb   = b_rs;

        // Handle the transposition of A.
        if ( bl1_does_trans( trans ) )
        {
            bl1_swap_ints( lda, inca );
        }

        // An optimization: if B is row-major, then let's access the matrix by rows
        // instead of by columns for increased spatial locality.
        if ( bl1_is_row_storage( b_rs, b_cs ) )
        {
            bl1_swap_ints( n_iter, n_elem );
            bl1_swap_ints( lda, inca );
            bl1_swap_ints( ldb, incb );
        }
    }

    // Extract conj component from trans parameter.
    conj = bl1_proj_trans1_to_conj( trans );

    for ( j = 0; j < n_iter; ++j )
    {
        // Compute the offsets in ptrdiff_t; the int products j*lda and j*ldb
        // would overflow once an operand spans more than INT_MAX elements.
        a_begin = a + ( ptrdiff_t )j * lda;
        b_begin = b + ( ptrdiff_t )j * ldb;

        bl1_szcopyv( conj,
                     n_elem,
                     a_begin, inca,
                     b_begin, incb );
    }
}
// Copies op(A), a dcomplex matrix, into the m-by-n float matrix B, where
// op(A) is A read as-is or transposed, as determined by
// bl1_does_trans( trans ). The element-wise copy, including how each complex
// element is reduced to a float, is delegated to bl1_zscopyv().
//
//   trans      - transposition selector for A; its conjugation component is
//                extracted with bl1_proj_trans1_to_conj() and passed on to
//                bl1_zscopyv().
//   m, n       - number of rows and columns of B (and of op(A)).
//   a          - address of the first element of A.
//   a_rs, a_cs - row and column strides of A, in elements.
//   b          - address of the first element of B.
//   b_rs, b_cs - row and column strides of B, in elements.
//
// One bl1_zscopyv() call is issued per column of B, or per row of B when B is
// row-major. When bl1_is_vector( m, n ) holds, a single call copies the whole
// operand.
void bl1_zscopymt( trans1_t trans, int m, int n, dcomplex* a, int a_rs, int a_cs, float* b, int b_rs, int b_cs )
{
    dcomplex* a_begin;
    float*    b_begin;
    int       lda, inca;
    int       ldb, incb;
    int       n_iter;
    int       n_elem;
    int       j;
    conj1_t   conj;

    // Return early if possible.
    if ( bl1_zero_dim2( m, n ) ) return;

    // Handle cases where A and B are vectors to ensure that the underlying copy
    // gets invoked only once.
    if ( bl1_is_vector( m, n ) )
    {
        // Initialize with values appropriate for vectors.
        n_iter = 1;
        n_elem = bl1_vector_dim( m, n );
        lda    = 1; // multiplied by zero when n_iter == 1; not needed.
        inca   = bl1_vector_inc( trans,              m, n, a_rs, a_cs );
        ldb    = 1; // multiplied by zero when n_iter == 1; not needed.
        incb   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
    }
    else // matrix case
    {
        // Initialize with optimal values for column-major storage of B.
        n_iter = n;
        n_elem = m;
        lda    = a_cs;
        inca   = a_rs;
        ldb    = b_cs;
        incb   = b_rs;

        // Handle the transposition of A.
        if ( bl1_does_trans( trans ) )
        {
            bl1_swap_ints( lda, inca );
        }

        // An optimization: if B is row-major, then let's access the matrix by rows
        // instead of by columns for increased spatial locality.
        if ( bl1_is_row_storage( b_rs, b_cs ) )
        {
            bl1_swap_ints( n_iter, n_elem );
            bl1_swap_ints( lda, inca );
            bl1_swap_ints( ldb, incb );
        }
    }

    // Extract conj component from trans parameter.
    conj = bl1_proj_trans1_to_conj( trans );

    for ( j = 0; j < n_iter; ++j )
    {
        // Compute the offsets in ptrdiff_t; the int products j*lda and j*ldb
        // would overflow once an operand spans more than INT_MAX elements.
        a_begin = a + ( ptrdiff_t )j * lda;
        b_begin = b + ( ptrdiff_t )j * ldb;

        bl1_zscopyv( conj,
                     n_elem,
                     a_begin, inca,
                     b_begin, incb );
    }
}
828
829 // dd
// Copies op(A), a double matrix, into the m-by-n double matrix B, where op(A)
// is A read as-is or transposed, as determined by bl1_does_trans( trans ).
// The element-wise copy is delegated to bl1_dcopyv().
//
//   trans      - transposition selector for A; its conjugation component is
//                extracted with bl1_proj_trans1_to_conj() and passed on to
//                bl1_dcopyv().
//   m, n       - number of rows and columns of B (and of op(A)).
//   a          - address of the first element of A.
//   a_rs, a_cs - row and column strides of A, in elements.
//   b          - address of the first element of B.
//   b_rs, b_cs - row and column strides of B, in elements.
//
// One bl1_dcopyv() call is issued per column of B, or per row of B when B is
// row-major. When bl1_is_vector( m, n ) holds, a single call copies the whole
// operand.
void bl1_ddcopymt( trans1_t trans, int m, int n, double* a, int a_rs, int a_cs, double* b, int b_rs, int b_cs )
{
    double*   a_begin;
    double*   b_begin;
    int       lda, inca;
    int       ldb, incb;
    int       n_iter;
    int       n_elem;
    int       j;
    conj1_t   conj;

    // Return early if possible.
    if ( bl1_zero_dim2( m, n ) ) return;

    // Handle cases where A and B are vectors to ensure that the underlying copy
    // gets invoked only once.
    if ( bl1_is_vector( m, n ) )
    {
        // Initialize with values appropriate for vectors.
        n_iter = 1;
        n_elem = bl1_vector_dim( m, n );
        lda    = 1; // multiplied by zero when n_iter == 1; not needed.
        inca   = bl1_vector_inc( trans,              m, n, a_rs, a_cs );
        ldb    = 1; // multiplied by zero when n_iter == 1; not needed.
        incb   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
    }
    else // matrix case
    {
        // Initialize with optimal values for column-major storage of B.
        n_iter = n;
        n_elem = m;
        lda    = a_cs;
        inca   = a_rs;
        ldb    = b_cs;
        incb   = b_rs;

        // Handle the transposition of A.
        if ( bl1_does_trans( trans ) )
        {
            bl1_swap_ints( lda, inca );
        }

        // An optimization: if B is row-major, then let's access the matrix by rows
        // instead of by columns for increased spatial locality.
        if ( bl1_is_row_storage( b_rs, b_cs ) )
        {
            bl1_swap_ints( n_iter, n_elem );
            bl1_swap_ints( lda, inca );
            bl1_swap_ints( ldb, incb );
        }
    }

    // Extract conj component from trans parameter.
    conj = bl1_proj_trans1_to_conj( trans );

    for ( j = 0; j < n_iter; ++j )
    {
        // Compute the offsets in ptrdiff_t; the int products j*lda and j*ldb
        // would overflow once an operand spans more than INT_MAX elements.
        a_begin = a + ( ptrdiff_t )j * lda;
        b_begin = b + ( ptrdiff_t )j * ldb;

        bl1_dcopyv( conj,
                    n_elem,
                    a_begin, inca,
                    b_begin, incb );
    }
}
896
897 // dc cd
// Copies op(A), a double matrix, into the m-by-n scomplex matrix B, where
// op(A) is A read as-is or transposed, as determined by
// bl1_does_trans( trans ). The element-wise copy, including the change of
// datatype, is delegated to bl1_dccopyv().
//
//   trans      - transposition selector for A; its conjugation component is
//                extracted with bl1_proj_trans1_to_conj() and passed on to
//                bl1_dccopyv().
//   m, n       - number of rows and columns of B (and of op(A)).
//   a          - address of the first element of A.
//   a_rs, a_cs - row and column strides of A, in elements.
//   b          - address of the first element of B.
//   b_rs, b_cs - row and column strides of B, in elements.
//
// One bl1_dccopyv() call is issued per column of B, or per row of B when B is
// row-major. When bl1_is_vector( m, n ) holds, a single call copies the whole
// operand.
void bl1_dccopymt( trans1_t trans, int m, int n, double* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs )
{
    double*   a_begin;
    scomplex* b_begin;
    int       lda, inca;
    int       ldb, incb;
    int       n_iter;
    int       n_elem;
    int       j;
    conj1_t   conj;

    // Return early if possible.
    if ( bl1_zero_dim2( m, n ) ) return;

    // Handle cases where A and B are vectors to ensure that the underlying copy
    // gets invoked only once.
    if ( bl1_is_vector( m, n ) )
    {
        // Initialize with values appropriate for vectors.
        n_iter = 1;
        n_elem = bl1_vector_dim( m, n );
        lda    = 1; // multiplied by zero when n_iter == 1; not needed.
        inca   = bl1_vector_inc( trans,              m, n, a_rs, a_cs );
        ldb    = 1; // multiplied by zero when n_iter == 1; not needed.
        incb   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
    }
    else // matrix case
    {
        // Initialize with optimal values for column-major storage of B.
        n_iter = n;
        n_elem = m;
        lda    = a_cs;
        inca   = a_rs;
        ldb    = b_cs;
        incb   = b_rs;

        // Handle the transposition of A.
        if ( bl1_does_trans( trans ) )
        {
            bl1_swap_ints( lda, inca );
        }

        // An optimization: if B is row-major, then let's access the matrix by rows
        // instead of by columns for increased spatial locality.
        if ( bl1_is_row_storage( b_rs, b_cs ) )
        {
            bl1_swap_ints( n_iter, n_elem );
            bl1_swap_ints( lda, inca );
            bl1_swap_ints( ldb, incb );
        }
    }

    // Extract conj component from trans parameter.
    conj = bl1_proj_trans1_to_conj( trans );

    for ( j = 0; j < n_iter; ++j )
    {
        // Compute the offsets in ptrdiff_t; the int products j*lda and j*ldb
        // would overflow once an operand spans more than INT_MAX elements.
        a_begin = a + ( ptrdiff_t )j * lda;
        b_begin = b + ( ptrdiff_t )j * ldb;

        bl1_dccopyv( conj,
                     n_elem,
                     a_begin, inca,
                     b_begin, incb );
    }
}
// Copies op(A), an scomplex matrix, into the m-by-n double matrix B, where
// op(A) is A read as-is or transposed, as determined by
// bl1_does_trans( trans ). The element-wise copy, including how each complex
// element is reduced to a double, is delegated to bl1_cdcopyv().
//
//   trans      - transposition selector for A; its conjugation component is
//                extracted with bl1_proj_trans1_to_conj() and passed on to
//                bl1_cdcopyv().
//   m, n       - number of rows and columns of B (and of op(A)).
//   a          - address of the first element of A.
//   a_rs, a_cs - row and column strides of A, in elements.
//   b          - address of the first element of B.
//   b_rs, b_cs - row and column strides of B, in elements.
//
// One bl1_cdcopyv() call is issued per column of B, or per row of B when B is
// row-major. When bl1_is_vector( m, n ) holds, a single call copies the whole
// operand.
void bl1_cdcopymt( trans1_t trans, int m, int n, scomplex* a, int a_rs, int a_cs, double* b, int b_rs, int b_cs )
{
    scomplex* a_begin;
    double*   b_begin;
    int       lda, inca;
    int       ldb, incb;
    int       n_iter;
    int       n_elem;
    int       j;
    conj1_t   conj;

    // Return early if possible.
    if ( bl1_zero_dim2( m, n ) ) return;

    // Handle cases where A and B are vectors to ensure that the underlying copy
    // gets invoked only once.
    if ( bl1_is_vector( m, n ) )
    {
        // Initialize with values appropriate for vectors.
        n_iter = 1;
        n_elem = bl1_vector_dim( m, n );
        lda    = 1; // multiplied by zero when n_iter == 1; not needed.
        inca   = bl1_vector_inc( trans,              m, n, a_rs, a_cs );
        ldb    = 1; // multiplied by zero when n_iter == 1; not needed.
        incb   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
    }
    else // matrix case
    {
        // Initialize with optimal values for column-major storage of B.
        n_iter = n;
        n_elem = m;
        lda    = a_cs;
        inca   = a_rs;
        ldb    = b_cs;
        incb   = b_rs;

        // Handle the transposition of A.
        if ( bl1_does_trans( trans ) )
        {
            bl1_swap_ints( lda, inca );
        }

        // An optimization: if B is row-major, then let's access the matrix by rows
        // instead of by columns for increased spatial locality.
        if ( bl1_is_row_storage( b_rs, b_cs ) )
        {
            bl1_swap_ints( n_iter, n_elem );
            bl1_swap_ints( lda, inca );
            bl1_swap_ints( ldb, incb );
        }
    }

    // Extract conj component from trans parameter.
    conj = bl1_proj_trans1_to_conj( trans );

    for ( j = 0; j < n_iter; ++j )
    {
        // Compute the offsets in ptrdiff_t; the int products j*lda and j*ldb
        // would overflow once an operand spans more than INT_MAX elements.
        a_begin = a + ( ptrdiff_t )j * lda;
        b_begin = b + ( ptrdiff_t )j * ldb;

        bl1_cdcopyv( conj,
                     n_elem,
                     a_begin, inca,
                     b_begin, incb );
    }
}
1030
1031 // dz zd
// Copies op(A), a double matrix, into the m-by-n dcomplex matrix B, where
// op(A) is A read as-is or transposed, as determined by
// bl1_does_trans( trans ). The element-wise copy, including the change of
// datatype, is delegated to bl1_dzcopyv().
//
//   trans      - transposition selector for A; its conjugation component is
//                extracted with bl1_proj_trans1_to_conj() and passed on to
//                bl1_dzcopyv().
//   m, n       - number of rows and columns of B (and of op(A)).
//   a          - address of the first element of A.
//   a_rs, a_cs - row and column strides of A, in elements.
//   b          - address of the first element of B.
//   b_rs, b_cs - row and column strides of B, in elements.
//
// One bl1_dzcopyv() call is issued per column of B, or per row of B when B is
// row-major. When bl1_is_vector( m, n ) holds, a single call copies the whole
// operand.
void bl1_dzcopymt( trans1_t trans, int m, int n, double* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs )
{
    double*   a_begin;
    dcomplex* b_begin;
    int       lda, inca;
    int       ldb, incb;
    int       n_iter;
    int       n_elem;
    int       j;
    conj1_t   conj;

    // Return early if possible.
    if ( bl1_zero_dim2( m, n ) ) return;

    // Handle cases where A and B are vectors to ensure that the underlying copy
    // gets invoked only once.
    if ( bl1_is_vector( m, n ) )
    {
        // Initialize with values appropriate for vectors.
        n_iter = 1;
        n_elem = bl1_vector_dim( m, n );
        lda    = 1; // multiplied by zero when n_iter == 1; not needed.
        inca   = bl1_vector_inc( trans,              m, n, a_rs, a_cs );
        ldb    = 1; // multiplied by zero when n_iter == 1; not needed.
        incb   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
    }
    else // matrix case
    {
        // Initialize with optimal values for column-major storage of B.
        n_iter = n;
        n_elem = m;
        lda    = a_cs;
        inca   = a_rs;
        ldb    = b_cs;
        incb   = b_rs;

        // Handle the transposition of A.
        if ( bl1_does_trans( trans ) )
        {
            bl1_swap_ints( lda, inca );
        }

        // An optimization: if B is row-major, then let's access the matrix by rows
        // instead of by columns for increased spatial locality.
        if ( bl1_is_row_storage( b_rs, b_cs ) )
        {
            bl1_swap_ints( n_iter, n_elem );
            bl1_swap_ints( lda, inca );
            bl1_swap_ints( ldb, incb );
        }
    }

    // Extract conj component from trans parameter.
    conj = bl1_proj_trans1_to_conj( trans );

    for ( j = 0; j < n_iter; ++j )
    {
        // Compute the offsets in ptrdiff_t; the int products j*lda and j*ldb
        // would overflow once an operand spans more than INT_MAX elements.
        a_begin = a + ( ptrdiff_t )j * lda;
        b_begin = b + ( ptrdiff_t )j * ldb;

        bl1_dzcopyv( conj,
                     n_elem,
                     a_begin, inca,
                     b_begin, incb );
    }
}
// Copies op(A), a dcomplex matrix, into the m-by-n double matrix B, where
// op(A) is A read as-is or transposed, as determined by
// bl1_does_trans( trans ). The element-wise copy, including how each complex
// element is reduced to a double, is delegated to bl1_zdcopyv().
//
//   trans      - transposition selector for A; its conjugation component is
//                extracted with bl1_proj_trans1_to_conj() and passed on to
//                bl1_zdcopyv().
//   m, n       - number of rows and columns of B (and of op(A)).
//   a          - address of the first element of A.
//   a_rs, a_cs - row and column strides of A, in elements.
//   b          - address of the first element of B.
//   b_rs, b_cs - row and column strides of B, in elements.
//
// One bl1_zdcopyv() call is issued per column of B, or per row of B when B is
// row-major. When bl1_is_vector( m, n ) holds, a single call copies the whole
// operand.
void bl1_zdcopymt( trans1_t trans, int m, int n, dcomplex* a, int a_rs, int a_cs, double* b, int b_rs, int b_cs )
{
    dcomplex* a_begin;
    double*   b_begin;
    int       lda, inca;
    int       ldb, incb;
    int       n_iter;
    int       n_elem;
    int       j;
    conj1_t   conj;

    // Return early if possible.
    if ( bl1_zero_dim2( m, n ) ) return;

    // Handle cases where A and B are vectors to ensure that the underlying copy
    // gets invoked only once.
    if ( bl1_is_vector( m, n ) )
    {
        // Initialize with values appropriate for vectors.
        n_iter = 1;
        n_elem = bl1_vector_dim( m, n );
        lda    = 1; // multiplied by zero when n_iter == 1; not needed.
        inca   = bl1_vector_inc( trans,              m, n, a_rs, a_cs );
        ldb    = 1; // multiplied by zero when n_iter == 1; not needed.
        incb   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
    }
    else // matrix case
    {
        // Initialize with optimal values for column-major storage of B.
        n_iter = n;
        n_elem = m;
        lda    = a_cs;
        inca   = a_rs;
        ldb    = b_cs;
        incb   = b_rs;

        // Handle the transposition of A.
        if ( bl1_does_trans( trans ) )
        {
            bl1_swap_ints( lda, inca );
        }

        // An optimization: if B is row-major, then let's access the matrix by rows
        // instead of by columns for increased spatial locality.
        if ( bl1_is_row_storage( b_rs, b_cs ) )
        {
            bl1_swap_ints( n_iter, n_elem );
            bl1_swap_ints( lda, inca );
            bl1_swap_ints( ldb, incb );
        }
    }

    // Extract conj component from trans parameter.
    conj = bl1_proj_trans1_to_conj( trans );

    for ( j = 0; j < n_iter; ++j )
    {
        // Compute the offsets in ptrdiff_t; the int products j*lda and j*ldb
        // would overflow once an operand spans more than INT_MAX elements.
        a_begin = a + ( ptrdiff_t )j * lda;
        b_begin = b + ( ptrdiff_t )j * ldb;

        bl1_zdcopyv( conj,
                     n_elem,
                     a_begin, inca,
                     b_begin, incb );
    }
}
1164
1165 // cc
// bl1_cccopymt
//
// Copies op(A) into the m-by-n matrix B, where both A and B hold scomplex
// elements. The work is done by bl1_ccopyv(), which is called once per column
// of B (or once per row of B when B is stored by rows).
//
//   trans       Selects the transposition of A. Its conjugation component is
//               extracted with bl1_proj_trans1_to_conj() and forwarded to
//               bl1_ccopyv().
//   m, n        Row and column dimensions of B.
//   a           Pointer to the first element of A.
//   a_rs, a_cs  Row and column strides of A, counted in elements.
//   b           Pointer to the first element of B.
//   b_rs, b_cs  Row and column strides of B, counted in elements.
void bl1_cccopymt( trans1_t trans, int m, int n, scomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs )
{
    scomplex* a_vec;
    scomplex* b_vec;
    int       a_ld, a_inc;   // A: step between vectors / step within a vector
    int       b_ld, b_inc;   // B: step between vectors / step within a vector
    int       num_vecs;      // number of bl1_ccopyv() invocations
    int       vec_len;       // number of elements handled per invocation
    int       k;
    conj1_t   conj;

    // An empty operand leaves nothing to copy.
    if ( bl1_zero_dim2( m, n ) ) return;

    if ( bl1_is_vector( m, n ) )
    {
        // Vector operands are copied with a single call. Because only k == 0
        // is ever visited, the leading dimensions are multiplied by zero and
        // their values are irrelevant; 1 is just a placeholder.
        num_vecs = 1;
        vec_len  = bl1_vector_dim( m, n );
        a_ld     = 1;
        a_inc    = bl1_vector_inc( trans, m, n, a_rs, a_cs );
        b_ld     = 1;
        b_inc    = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
    }
    else
    {
        // General matrix: by default walk B one column at a time, which is
        // the favorable order when B is stored by columns.
        num_vecs = n;
        vec_len  = m;
        b_ld     = b_cs;
        b_inc    = b_rs;

        // A transposition of A exchanges the roles of its two strides.
        if ( bl1_does_trans( trans ) )
        {
            a_ld  = a_rs;
            a_inc = a_cs;
        }
        else
        {
            a_ld  = a_cs;
            a_inc = a_rs;
        }

        // When B is stored by rows, walk it one row at a time instead so
        // that consecutive writes to B stay close together in memory.
        if ( bl1_is_row_storage( b_rs, b_cs ) )
        {
            num_vecs = m;
            vec_len  = n;
            bl1_swap_ints( a_ld, a_inc );
            bl1_swap_ints( b_ld, b_inc );
        }
    }

    // Only the conjugation part of trans is meaningful to the vector copy.
    conj = bl1_proj_trans1_to_conj( trans );

    for ( k = 0; k < num_vecs; ++k )
    {
        a_vec = a + k*a_ld;
        b_vec = b + k*b_ld;

        bl1_ccopyv( conj,
                    vec_len,
                    a_vec, a_inc,
                    b_vec, b_inc );
    }
}
1232
1233 // cz zc
// bl1_czcopymt
//
// Copies op(A) into the m-by-n matrix B, where A holds scomplex elements and
// B holds dcomplex elements. The work is done by bl1_czcopyv(), which is
// called once per column of B (or once per row of B when B is stored by
// rows).
//
//   trans       Selects the transposition of A. Its conjugation component is
//               extracted with bl1_proj_trans1_to_conj() and forwarded to
//               bl1_czcopyv().
//   m, n        Row and column dimensions of B.
//   a           Pointer to the first element of A.
//   a_rs, a_cs  Row and column strides of A, counted in elements.
//   b           Pointer to the first element of B.
//   b_rs, b_cs  Row and column strides of B, counted in elements.
void bl1_czcopymt( trans1_t trans, int m, int n, scomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs )
{
    scomplex* a_vec;
    dcomplex* b_vec;
    int       a_ld, a_inc;   // A: step between vectors / step within a vector
    int       b_ld, b_inc;   // B: step between vectors / step within a vector
    int       num_vecs;      // number of bl1_czcopyv() invocations
    int       vec_len;       // number of elements handled per invocation
    int       k;
    conj1_t   conj;

    // An empty operand leaves nothing to copy.
    if ( bl1_zero_dim2( m, n ) ) return;

    if ( bl1_is_vector( m, n ) )
    {
        // Vector operands are copied with a single call. Because only k == 0
        // is ever visited, the leading dimensions are multiplied by zero and
        // their values are irrelevant; 1 is just a placeholder.
        num_vecs = 1;
        vec_len  = bl1_vector_dim( m, n );
        a_ld     = 1;
        a_inc    = bl1_vector_inc( trans, m, n, a_rs, a_cs );
        b_ld     = 1;
        b_inc    = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
    }
    else
    {
        // General matrix: by default walk B one column at a time, which is
        // the favorable order when B is stored by columns.
        num_vecs = n;
        vec_len  = m;
        b_ld     = b_cs;
        b_inc    = b_rs;

        // A transposition of A exchanges the roles of its two strides.
        if ( bl1_does_trans( trans ) )
        {
            a_ld  = a_rs;
            a_inc = a_cs;
        }
        else
        {
            a_ld  = a_cs;
            a_inc = a_rs;
        }

        // When B is stored by rows, walk it one row at a time instead so
        // that consecutive writes to B stay close together in memory.
        if ( bl1_is_row_storage( b_rs, b_cs ) )
        {
            num_vecs = m;
            vec_len  = n;
            bl1_swap_ints( a_ld, a_inc );
            bl1_swap_ints( b_ld, b_inc );
        }
    }

    // Only the conjugation part of trans is meaningful to the vector copy.
    conj = bl1_proj_trans1_to_conj( trans );

    for ( k = 0; k < num_vecs; ++k )
    {
        a_vec = a + k*a_ld;
        b_vec = b + k*b_ld;

        bl1_czcopyv( conj,
                     vec_len,
                     a_vec, a_inc,
                     b_vec, b_inc );
    }
}
// bl1_zccopymt
//
// Copies op(A) into the m-by-n matrix B, where A holds dcomplex elements and
// B holds scomplex elements. The work is done by bl1_zccopyv(), which is
// called once per column of B (or once per row of B when B is stored by
// rows).
//
//   trans       Selects the transposition of A. Its conjugation component is
//               extracted with bl1_proj_trans1_to_conj() and forwarded to
//               bl1_zccopyv().
//   m, n        Row and column dimensions of B.
//   a           Pointer to the first element of A.
//   a_rs, a_cs  Row and column strides of A, counted in elements.
//   b           Pointer to the first element of B.
//   b_rs, b_cs  Row and column strides of B, counted in elements.
void bl1_zccopymt( trans1_t trans, int m, int n, dcomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs )
{
    dcomplex* a_vec;
    scomplex* b_vec;
    int       a_ld, a_inc;   // A: step between vectors / step within a vector
    int       b_ld, b_inc;   // B: step between vectors / step within a vector
    int       num_vecs;      // number of bl1_zccopyv() invocations
    int       vec_len;       // number of elements handled per invocation
    int       k;
    conj1_t   conj;

    // An empty operand leaves nothing to copy.
    if ( bl1_zero_dim2( m, n ) ) return;

    if ( bl1_is_vector( m, n ) )
    {
        // Vector operands are copied with a single call. Because only k == 0
        // is ever visited, the leading dimensions are multiplied by zero and
        // their values are irrelevant; 1 is just a placeholder.
        num_vecs = 1;
        vec_len  = bl1_vector_dim( m, n );
        a_ld     = 1;
        a_inc    = bl1_vector_inc( trans, m, n, a_rs, a_cs );
        b_ld     = 1;
        b_inc    = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
    }
    else
    {
        // General matrix: by default walk B one column at a time, which is
        // the favorable order when B is stored by columns.
        num_vecs = n;
        vec_len  = m;
        b_ld     = b_cs;
        b_inc    = b_rs;

        // A transposition of A exchanges the roles of its two strides.
        if ( bl1_does_trans( trans ) )
        {
            a_ld  = a_rs;
            a_inc = a_cs;
        }
        else
        {
            a_ld  = a_cs;
            a_inc = a_rs;
        }

        // When B is stored by rows, walk it one row at a time instead so
        // that consecutive writes to B stay close together in memory.
        if ( bl1_is_row_storage( b_rs, b_cs ) )
        {
            num_vecs = m;
            vec_len  = n;
            bl1_swap_ints( a_ld, a_inc );
            bl1_swap_ints( b_ld, b_inc );
        }
    }

    // Only the conjugation part of trans is meaningful to the vector copy.
    conj = bl1_proj_trans1_to_conj( trans );

    for ( k = 0; k < num_vecs; ++k )
    {
        a_vec = a + k*a_ld;
        b_vec = b + k*b_ld;

        bl1_zccopyv( conj,
                     vec_len,
                     a_vec, a_inc,
                     b_vec, b_inc );
    }
}
1366
1367 // zz
// bl1_zzcopymt
//
// Copies op(A) into the m-by-n matrix B, where both A and B hold dcomplex
// elements. The work is done by bl1_zcopyv(), which is called once per column
// of B (or once per row of B when B is stored by rows).
//
//   trans       Selects the transposition of A. Its conjugation component is
//               extracted with bl1_proj_trans1_to_conj() and forwarded to
//               bl1_zcopyv().
//   m, n        Row and column dimensions of B.
//   a           Pointer to the first element of A.
//   a_rs, a_cs  Row and column strides of A, counted in elements.
//   b           Pointer to the first element of B.
//   b_rs, b_cs  Row and column strides of B, counted in elements.
void bl1_zzcopymt( trans1_t trans, int m, int n, dcomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs )
{
    dcomplex* a_vec;
    dcomplex* b_vec;
    int       a_ld, a_inc;   // A: step between vectors / step within a vector
    int       b_ld, b_inc;   // B: step between vectors / step within a vector
    int       num_vecs;      // number of bl1_zcopyv() invocations
    int       vec_len;       // number of elements handled per invocation
    int       k;
    conj1_t   conj;

    // An empty operand leaves nothing to copy.
    if ( bl1_zero_dim2( m, n ) ) return;

    if ( bl1_is_vector( m, n ) )
    {
        // Vector operands are copied with a single call. Because only k == 0
        // is ever visited, the leading dimensions are multiplied by zero and
        // their values are irrelevant; 1 is just a placeholder.
        num_vecs = 1;
        vec_len  = bl1_vector_dim( m, n );
        a_ld     = 1;
        a_inc    = bl1_vector_inc( trans, m, n, a_rs, a_cs );
        b_ld     = 1;
        b_inc    = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
    }
    else
    {
        // General matrix: by default walk B one column at a time, which is
        // the favorable order when B is stored by columns.
        num_vecs = n;
        vec_len  = m;
        b_ld     = b_cs;
        b_inc    = b_rs;

        // A transposition of A exchanges the roles of its two strides.
        if ( bl1_does_trans( trans ) )
        {
            a_ld  = a_rs;
            a_inc = a_cs;
        }
        else
        {
            a_ld  = a_cs;
            a_inc = a_rs;
        }

        // When B is stored by rows, walk it one row at a time instead so
        // that consecutive writes to B stay close together in memory.
        if ( bl1_is_row_storage( b_rs, b_cs ) )
        {
            num_vecs = m;
            vec_len  = n;
            bl1_swap_ints( a_ld, a_inc );
            bl1_swap_ints( b_ld, b_inc );
        }
    }

    // Only the conjugation part of trans is meaningful to the vector copy.
    conj = bl1_proj_trans1_to_conj( trans );

    for ( k = 0; k < num_vecs; ++k )
    {
        a_vec = a + k*a_ld;
        b_vec = b + k*b_ld;

        bl1_zcopyv( conj,
                    vec_len,
                    a_vec, a_inc,
                    b_vec, b_inc );
    }
}
1434
1435