1 /*
2 
3     Copyright (C) 2014, The University of Texas at Austin
4 
5     This file is part of libflame and is available under the 3-Clause
6     BSD license, which can be found in the LICENSE file at the top-level
7     directory, or at http://opensource.org/licenses/BSD-3-Clause
8 
9 */
10 
11 #include "blis1.h"
12 
// bl1_icopymt
//
// Copies the int matrix A, as viewed through trans, into the m-by-n int
// matrix B. The work is issued as a sequence of strided vector copies
// (bl1_icopyv): one per column of B, or one per row when B is row-stored and
// A, as viewed through trans, is row-stored as well.
//
//   trans         - selects whether A is read transposed; its conjugation
//                   component is forwarded to bl1_icopyv.
//   m, n          - row and column counts of B.
//   a, a_rs, a_cs - base pointer, row stride and column stride of A.
//   b, b_rs, b_cs - base pointer, row stride and column stride of B.
void bl1_icopymt( trans1_t trans, int m, int n, int* a, int a_rs, int a_cs, int* b, int b_rs, int b_cs )
{
	int*      a_vec;
	int*      b_vec;
	int       a_outer, a_inner;
	int       b_outer, b_inner;
	int       n_outer, n_inner;
	int       k;

	// Leave immediately when bl1_zero_dim2() reports there is nothing to do.
	if ( bl1_zero_dim2( m, n ) ) return;

	if ( bl1_is_vector( m, n ) )
	{
		// Vector operands: a single copy moves everything. The outer strides
		// are only ever scaled by k == 0, so 1 is merely a placeholder.
		n_outer = 1;
		n_inner = bl1_vector_dim( m, n );
		a_outer = 1;
		b_outer = 1;
		a_inner = bl1_vector_inc( trans,             m, n, a_rs, a_cs );
		b_inner = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
	}
	else
	{
		// Matrix operands: by default, move one column of B per copy.
		n_outer = n;
		n_inner = m;
		b_outer = b_cs;
		b_inner = b_rs;

		// A transposed read of A exchanges the roles of its two strides.
		if ( bl1_does_trans( trans ) )
		{
			a_outer = a_rs;
			a_inner = a_cs;
		}
		else
		{
			a_outer = a_cs;
			a_inner = a_rs;
		}

		// If B is row-stored and A, as viewed through trans, is row-stored
		// too, move one row per copy instead for better spatial locality.
		if ( bl1_is_row_storage( b_rs, b_cs ) )
		{
			if ( ( bl1_is_row_storage( a_rs, a_cs ) && bl1_does_notrans( trans ) ) ||
			     ( bl1_is_col_storage( a_rs, a_cs ) && bl1_does_trans( trans ) ) )
			{
				bl1_swap_ints( n_outer, n_inner );
				bl1_swap_ints( a_outer, a_inner );
				bl1_swap_ints( b_outer, b_inner );
			}
		}
	}

	for ( k = 0; k < n_outer; ++k )
	{
		// Locate the k-th vector of each operand.
		a_vec = a + k*a_outer;
		b_vec = b + k*b_outer;

		bl1_icopyv( bl1_proj_trans1_to_conj( trans ),
		            n_inner,
		            a_vec, a_inner,
		            b_vec, b_inner );
	}
}
80 
// bl1_scopymt
//
// Copies the float matrix A, as viewed through trans, into the m-by-n float
// matrix B. The work is issued as a sequence of strided vector copies
// (bl1_scopy): one per column of B, or one per row when B is row-stored and
// A, as viewed through trans, is row-stored as well.
//
//   trans         - selects whether A is read transposed; any conjugation
//                   component is not consulted by this routine.
//   m, n          - row and column counts of B.
//   a, a_rs, a_cs - base pointer, row stride and column stride of A.
//   b, b_rs, b_cs - base pointer, row stride and column stride of B.
void bl1_scopymt( trans1_t trans, int m, int n, float* a, int a_rs, int a_cs, float* b, int b_rs, int b_cs )
{
	float*    a_vec;
	float*    b_vec;
	int       a_outer, a_inner;
	int       b_outer, b_inner;
	int       n_outer, n_inner;
	int       k;

	// Leave immediately when bl1_zero_dim2() reports there is nothing to do.
	if ( bl1_zero_dim2( m, n ) ) return;

	if ( bl1_is_vector( m, n ) )
	{
		// Vector operands: a single copy moves everything. The outer strides
		// are only ever scaled by k == 0, so 1 is merely a placeholder.
		n_outer = 1;
		n_inner = bl1_vector_dim( m, n );
		a_outer = 1;
		b_outer = 1;
		a_inner = bl1_vector_inc( trans,             m, n, a_rs, a_cs );
		b_inner = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
	}
	else
	{
		// Matrix operands: by default, move one column of B per copy.
		n_outer = n;
		n_inner = m;
		b_outer = b_cs;
		b_inner = b_rs;

		// A transposed read of A exchanges the roles of its two strides.
		if ( bl1_does_trans( trans ) )
		{
			a_outer = a_rs;
			a_inner = a_cs;
		}
		else
		{
			a_outer = a_cs;
			a_inner = a_rs;
		}

		// If B is row-stored and A, as viewed through trans, is row-stored
		// too, move one row per copy instead for better spatial locality.
		if ( bl1_is_row_storage( b_rs, b_cs ) )
		{
			if ( ( bl1_is_row_storage( a_rs, a_cs ) && bl1_does_notrans( trans ) ) ||
			     ( bl1_is_col_storage( a_rs, a_cs ) && bl1_does_trans( trans ) ) )
			{
				bl1_swap_ints( n_outer, n_inner );
				bl1_swap_ints( a_outer, a_inner );
				bl1_swap_ints( b_outer, b_inner );
			}
		}
	}

	for ( k = 0; k < n_outer; ++k )
	{
		// Locate the k-th vector of each operand.
		a_vec = a + k*a_outer;
		b_vec = b + k*b_outer;

		bl1_scopy( n_inner,
		           a_vec, a_inner,
		           b_vec, b_inner );
	}
}
147 
// bl1_dcopymt
//
// Copies the double matrix A, as viewed through trans, into the m-by-n double
// matrix B. The work is issued as a sequence of strided vector copies
// (bl1_dcopy): one per column of B, or one per row when B is row-stored and
// A, as viewed through trans, is row-stored as well.
//
//   trans         - selects whether A is read transposed; any conjugation
//                   component is not consulted by this routine.
//   m, n          - row and column counts of B.
//   a, a_rs, a_cs - base pointer, row stride and column stride of A.
//   b, b_rs, b_cs - base pointer, row stride and column stride of B.
void bl1_dcopymt( trans1_t trans, int m, int n, double* a, int a_rs, int a_cs, double* b, int b_rs, int b_cs )
{
	double*   a_vec;
	double*   b_vec;
	int       a_outer, a_inner;
	int       b_outer, b_inner;
	int       n_outer, n_inner;
	int       k;

	// Leave immediately when bl1_zero_dim2() reports there is nothing to do.
	if ( bl1_zero_dim2( m, n ) ) return;

	if ( bl1_is_vector( m, n ) )
	{
		// Vector operands: a single copy moves everything. The outer strides
		// are only ever scaled by k == 0, so 1 is merely a placeholder.
		n_outer = 1;
		n_inner = bl1_vector_dim( m, n );
		a_outer = 1;
		b_outer = 1;
		a_inner = bl1_vector_inc( trans,             m, n, a_rs, a_cs );
		b_inner = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
	}
	else
	{
		// Matrix operands: by default, move one column of B per copy.
		n_outer = n;
		n_inner = m;
		b_outer = b_cs;
		b_inner = b_rs;

		// A transposed read of A exchanges the roles of its two strides.
		if ( bl1_does_trans( trans ) )
		{
			a_outer = a_rs;
			a_inner = a_cs;
		}
		else
		{
			a_outer = a_cs;
			a_inner = a_rs;
		}

		// If B is row-stored and A, as viewed through trans, is row-stored
		// too, move one row per copy instead for better spatial locality.
		if ( bl1_is_row_storage( b_rs, b_cs ) )
		{
			if ( ( bl1_is_row_storage( a_rs, a_cs ) && bl1_does_notrans( trans ) ) ||
			     ( bl1_is_col_storage( a_rs, a_cs ) && bl1_does_trans( trans ) ) )
			{
				bl1_swap_ints( n_outer, n_inner );
				bl1_swap_ints( a_outer, a_inner );
				bl1_swap_ints( b_outer, b_inner );
			}
		}
	}

	for ( k = 0; k < n_outer; ++k )
	{
		// Locate the k-th vector of each operand.
		a_vec = a + k*a_outer;
		b_vec = b + k*b_outer;

		bl1_dcopy( n_inner,
		           a_vec, a_inner,
		           b_vec, b_inner );
	}
}
214 
// bl1_ccopymt
//
// Copies the scomplex matrix A, as viewed through trans, into the m-by-n
// scomplex matrix B. The work is issued as a sequence of strided vector
// copies (bl1_ccopy): one per column of B, or one per row when B is
// row-stored and A, as viewed through trans, is row-stored as well. When
// bl1_does_conj( trans ) holds, each freshly written vector of B is then
// passed to bl1_cconjv.
//
//   trans         - selects whether A is read transposed and whether the
//                   copied vectors are handed to bl1_cconjv.
//   m, n          - row and column counts of B.
//   a, a_rs, a_cs - base pointer, row stride and column stride of A.
//   b, b_rs, b_cs - base pointer, row stride and column stride of B.
void bl1_ccopymt( trans1_t trans, int m, int n, scomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs )
{
	scomplex* a_vec;
	scomplex* b_vec;
	int       a_outer, a_inner;
	int       b_outer, b_inner;
	int       n_outer, n_inner;
	int       k;

	// Leave immediately when bl1_zero_dim2() reports there is nothing to do.
	if ( bl1_zero_dim2( m, n ) ) return;

	if ( bl1_is_vector( m, n ) )
	{
		// Vector operands: a single copy moves everything. The outer strides
		// are only ever scaled by k == 0, so 1 is merely a placeholder.
		n_outer = 1;
		n_inner = bl1_vector_dim( m, n );
		a_outer = 1;
		b_outer = 1;
		a_inner = bl1_vector_inc( trans,             m, n, a_rs, a_cs );
		b_inner = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
	}
	else
	{
		// Matrix operands: by default, move one column of B per copy.
		n_outer = n;
		n_inner = m;
		b_outer = b_cs;
		b_inner = b_rs;

		// A transposed read of A exchanges the roles of its two strides.
		if ( bl1_does_trans( trans ) )
		{
			a_outer = a_rs;
			a_inner = a_cs;
		}
		else
		{
			a_outer = a_cs;
			a_inner = a_rs;
		}

		// If B is row-stored and A, as viewed through trans, is row-stored
		// too, move one row per copy instead for better spatial locality.
		if ( bl1_is_row_storage( b_rs, b_cs ) )
		{
			if ( ( bl1_is_row_storage( a_rs, a_cs ) && bl1_does_notrans( trans ) ) ||
			     ( bl1_is_col_storage( a_rs, a_cs ) && bl1_does_trans( trans ) ) )
			{
				bl1_swap_ints( n_outer, n_inner );
				bl1_swap_ints( a_outer, a_inner );
				bl1_swap_ints( b_outer, b_inner );
			}
		}
	}

	for ( k = 0; k < n_outer; ++k )
	{
		// Locate the k-th vector of each operand.
		a_vec = a + k*a_outer;
		b_vec = b + k*b_outer;

		bl1_ccopy( n_inner,
		           a_vec, a_inner,
		           b_vec, b_inner );

		// Conjugation is applied to the destination vector after the copy.
		if ( bl1_does_conj( trans ) )
		{
			bl1_cconjv( n_inner,
			            b_vec, b_inner );
		}
	}
}
285 
// bl1_zcopymt
//
// Copies the dcomplex matrix A, as viewed through trans, into the m-by-n
// dcomplex matrix B. The work is issued as a sequence of strided vector
// copies (bl1_zcopy): one per column of B, or one per row when B is
// row-stored and A, as viewed through trans, is row-stored as well. When
// bl1_does_conj( trans ) holds, each freshly written vector of B is then
// passed to bl1_zconjv.
//
//   trans         - selects whether A is read transposed and whether the
//                   copied vectors are handed to bl1_zconjv.
//   m, n          - row and column counts of B.
//   a, a_rs, a_cs - base pointer, row stride and column stride of A.
//   b, b_rs, b_cs - base pointer, row stride and column stride of B.
void bl1_zcopymt( trans1_t trans, int m, int n, dcomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs )
{
	dcomplex* a_vec;
	dcomplex* b_vec;
	int       a_outer, a_inner;
	int       b_outer, b_inner;
	int       n_outer, n_inner;
	int       k;

	// Leave immediately when bl1_zero_dim2() reports there is nothing to do.
	if ( bl1_zero_dim2( m, n ) ) return;

	if ( bl1_is_vector( m, n ) )
	{
		// Vector operands: a single copy moves everything. The outer strides
		// are only ever scaled by k == 0, so 1 is merely a placeholder.
		n_outer = 1;
		n_inner = bl1_vector_dim( m, n );
		a_outer = 1;
		b_outer = 1;
		a_inner = bl1_vector_inc( trans,             m, n, a_rs, a_cs );
		b_inner = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
	}
	else
	{
		// Matrix operands: by default, move one column of B per copy.
		n_outer = n;
		n_inner = m;
		b_outer = b_cs;
		b_inner = b_rs;

		// A transposed read of A exchanges the roles of its two strides.
		if ( bl1_does_trans( trans ) )
		{
			a_outer = a_rs;
			a_inner = a_cs;
		}
		else
		{
			a_outer = a_cs;
			a_inner = a_rs;
		}

		// If B is row-stored and A, as viewed through trans, is row-stored
		// too, move one row per copy instead for better spatial locality.
		if ( bl1_is_row_storage( b_rs, b_cs ) )
		{
			if ( ( bl1_is_row_storage( a_rs, a_cs ) && bl1_does_notrans( trans ) ) ||
			     ( bl1_is_col_storage( a_rs, a_cs ) && bl1_does_trans( trans ) ) )
			{
				bl1_swap_ints( n_outer, n_inner );
				bl1_swap_ints( a_outer, a_inner );
				bl1_swap_ints( b_outer, b_inner );
			}
		}
	}

	for ( k = 0; k < n_outer; ++k )
	{
		// Locate the k-th vector of each operand.
		a_vec = a + k*a_outer;
		b_vec = b + k*b_outer;

		bl1_zcopy( n_inner,
		           a_vec, a_inner,
		           b_vec, b_inner );

		// Conjugation is applied to the destination vector after the copy.
		if ( bl1_does_conj( trans ) )
		{
			bl1_zconjv( n_inner,
			            b_vec, b_inner );
		}
	}
}
356 
357 // --- Mixed-datatype and general stride copy routines---------------
358 
359 // ss
// bl1_sscopymt
//
// Copies the float matrix A, as viewed through trans, into the m-by-n float
// matrix B. The work is issued as a sequence of strided vector copies
// (bl1_scopyv): one per column of B, or one per row whenever B is row-stored,
// regardless of how A is stored.
//
//   trans         - selects whether A is read transposed; its conjugation
//                   component is forwarded to bl1_scopyv.
//   m, n          - row and column counts of B.
//   a, a_rs, a_cs - base pointer, row stride and column stride of A.
//   b, b_rs, b_cs - base pointer, row stride and column stride of B.
void bl1_sscopymt( trans1_t trans, int m, int n, float* a, int a_rs, int a_cs, float* b, int b_rs, int b_cs )
{
	float*    a_vec;
	float*    b_vec;
	int       a_outer, a_inner;
	int       b_outer, b_inner;
	int       n_outer, n_inner;
	int       k;
	conj1_t   conja;

	// Leave immediately when bl1_zero_dim2() reports there is nothing to do.
	if ( bl1_zero_dim2( m, n ) ) return;

	if ( bl1_is_vector( m, n ) )
	{
		// Vector operands: a single copy moves everything. The outer strides
		// are only ever scaled by k == 0, so 1 is merely a placeholder.
		n_outer = 1;
		n_inner = bl1_vector_dim( m, n );
		a_outer = 1;
		b_outer = 1;
		a_inner = bl1_vector_inc( trans,             m, n, a_rs, a_cs );
		b_inner = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
	}
	else
	{
		// Matrix operands: by default, move one column of B per copy.
		n_outer = n;
		n_inner = m;
		b_outer = b_cs;
		b_inner = b_rs;

		// A transposed read of A exchanges the roles of its two strides.
		if ( bl1_does_trans( trans ) )
		{
			a_outer = a_rs;
			a_inner = a_cs;
		}
		else
		{
			a_outer = a_cs;
			a_inner = a_rs;
		}

		// A row-stored B is better served by moving one row per copy, which
		// improves spatial locality on the destination side.
		if ( bl1_is_row_storage( b_rs, b_cs ) )
		{
			bl1_swap_ints( n_outer, n_inner );
			bl1_swap_ints( a_outer, a_inner );
			bl1_swap_ints( b_outer, b_inner );
		}
	}

	// Reduce trans to the conjugation argument expected by the vector copy.
	conja = bl1_proj_trans1_to_conj( trans );

	for ( k = 0; k < n_outer; ++k )
	{
		// Locate the k-th vector of each operand.
		a_vec = a + k*a_outer;
		b_vec = b + k*b_outer;

		bl1_scopyv( conja,
		            n_inner,
		            a_vec, a_inner,
		            b_vec, b_inner );
	}
}
426 
427 // sd ds
// bl1_sdcopymt
//
// Copies the float matrix A, as viewed through trans, into the m-by-n double
// matrix B. The work is issued as a sequence of strided mixed-type vector
// copies (bl1_sdcopyv): one per column of B, or one per row whenever B is
// row-stored, regardless of how A is stored.
//
//   trans         - selects whether A is read transposed; its conjugation
//                   component is forwarded to bl1_sdcopyv.
//   m, n          - row and column counts of B.
//   a, a_rs, a_cs - base pointer, row stride and column stride of A.
//   b, b_rs, b_cs - base pointer, row stride and column stride of B.
void bl1_sdcopymt( trans1_t trans, int m, int n, float* a, int a_rs, int a_cs, double* b, int b_rs, int b_cs )
{
	float*    a_vec;
	double*   b_vec;
	int       a_outer, a_inner;
	int       b_outer, b_inner;
	int       n_outer, n_inner;
	int       k;
	conj1_t   conja;

	// Leave immediately when bl1_zero_dim2() reports there is nothing to do.
	if ( bl1_zero_dim2( m, n ) ) return;

	if ( bl1_is_vector( m, n ) )
	{
		// Vector operands: a single copy moves everything. The outer strides
		// are only ever scaled by k == 0, so 1 is merely a placeholder.
		n_outer = 1;
		n_inner = bl1_vector_dim( m, n );
		a_outer = 1;
		b_outer = 1;
		a_inner = bl1_vector_inc( trans,             m, n, a_rs, a_cs );
		b_inner = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
	}
	else
	{
		// Matrix operands: by default, move one column of B per copy.
		n_outer = n;
		n_inner = m;
		b_outer = b_cs;
		b_inner = b_rs;

		// A transposed read of A exchanges the roles of its two strides.
		if ( bl1_does_trans( trans ) )
		{
			a_outer = a_rs;
			a_inner = a_cs;
		}
		else
		{
			a_outer = a_cs;
			a_inner = a_rs;
		}

		// A row-stored B is better served by moving one row per copy, which
		// improves spatial locality on the destination side.
		if ( bl1_is_row_storage( b_rs, b_cs ) )
		{
			bl1_swap_ints( n_outer, n_inner );
			bl1_swap_ints( a_outer, a_inner );
			bl1_swap_ints( b_outer, b_inner );
		}
	}

	// Reduce trans to the conjugation argument expected by the vector copy.
	conja = bl1_proj_trans1_to_conj( trans );

	for ( k = 0; k < n_outer; ++k )
	{
		// Locate the k-th vector of each operand.
		a_vec = a + k*a_outer;
		b_vec = b + k*b_outer;

		bl1_sdcopyv( conja,
		             n_inner,
		             a_vec, a_inner,
		             b_vec, b_inner );
	}
}
// bl1_dscopymt
//
// Copies the double matrix A, as viewed through trans, into the m-by-n float
// matrix B. The work is issued as a sequence of strided mixed-type vector
// copies (bl1_dscopyv): one per column of B, or one per row whenever B is
// row-stored, regardless of how A is stored.
//
//   trans         - selects whether A is read transposed; its conjugation
//                   component is forwarded to bl1_dscopyv.
//   m, n          - row and column counts of B.
//   a, a_rs, a_cs - base pointer, row stride and column stride of A.
//   b, b_rs, b_cs - base pointer, row stride and column stride of B.
void bl1_dscopymt( trans1_t trans, int m, int n, double* a, int a_rs, int a_cs, float* b, int b_rs, int b_cs )
{
	double*   a_vec;
	float*    b_vec;
	int       a_outer, a_inner;
	int       b_outer, b_inner;
	int       n_outer, n_inner;
	int       k;
	conj1_t   conja;

	// Leave immediately when bl1_zero_dim2() reports there is nothing to do.
	if ( bl1_zero_dim2( m, n ) ) return;

	if ( bl1_is_vector( m, n ) )
	{
		// Vector operands: a single copy moves everything. The outer strides
		// are only ever scaled by k == 0, so 1 is merely a placeholder.
		n_outer = 1;
		n_inner = bl1_vector_dim( m, n );
		a_outer = 1;
		b_outer = 1;
		a_inner = bl1_vector_inc( trans,             m, n, a_rs, a_cs );
		b_inner = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
	}
	else
	{
		// Matrix operands: by default, move one column of B per copy.
		n_outer = n;
		n_inner = m;
		b_outer = b_cs;
		b_inner = b_rs;

		// A transposed read of A exchanges the roles of its two strides.
		if ( bl1_does_trans( trans ) )
		{
			a_outer = a_rs;
			a_inner = a_cs;
		}
		else
		{
			a_outer = a_cs;
			a_inner = a_rs;
		}

		// A row-stored B is better served by moving one row per copy, which
		// improves spatial locality on the destination side.
		if ( bl1_is_row_storage( b_rs, b_cs ) )
		{
			bl1_swap_ints( n_outer, n_inner );
			bl1_swap_ints( a_outer, a_inner );
			bl1_swap_ints( b_outer, b_inner );
		}
	}

	// Reduce trans to the conjugation argument expected by the vector copy.
	conja = bl1_proj_trans1_to_conj( trans );

	for ( k = 0; k < n_outer; ++k )
	{
		// Locate the k-th vector of each operand.
		a_vec = a + k*a_outer;
		b_vec = b + k*b_outer;

		bl1_dscopyv( conja,
		             n_inner,
		             a_vec, a_inner,
		             b_vec, b_inner );
	}
}
560 
561 // sc cs
// bl1_sccopymt
//
// Copies the float matrix A, as viewed through trans, into the m-by-n
// scomplex matrix B. The work is issued as a sequence of strided mixed-type
// vector copies (bl1_sccopyv): one per column of B, or one per row whenever B
// is row-stored, regardless of how A is stored. How a float element maps onto
// an scomplex element is decided entirely by bl1_sccopyv.
//
//   trans         - selects whether A is read transposed; its conjugation
//                   component is forwarded to bl1_sccopyv.
//   m, n          - row and column counts of B.
//   a, a_rs, a_cs - base pointer, row stride and column stride of A.
//   b, b_rs, b_cs - base pointer, row stride and column stride of B.
void bl1_sccopymt( trans1_t trans, int m, int n, float* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs )
{
	float*    a_vec;
	scomplex* b_vec;
	int       a_outer, a_inner;
	int       b_outer, b_inner;
	int       n_outer, n_inner;
	int       k;
	conj1_t   conja;

	// Leave immediately when bl1_zero_dim2() reports there is nothing to do.
	if ( bl1_zero_dim2( m, n ) ) return;

	if ( bl1_is_vector( m, n ) )
	{
		// Vector operands: a single copy moves everything. The outer strides
		// are only ever scaled by k == 0, so 1 is merely a placeholder.
		n_outer = 1;
		n_inner = bl1_vector_dim( m, n );
		a_outer = 1;
		b_outer = 1;
		a_inner = bl1_vector_inc( trans,             m, n, a_rs, a_cs );
		b_inner = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
	}
	else
	{
		// Matrix operands: by default, move one column of B per copy.
		n_outer = n;
		n_inner = m;
		b_outer = b_cs;
		b_inner = b_rs;

		// A transposed read of A exchanges the roles of its two strides.
		if ( bl1_does_trans( trans ) )
		{
			a_outer = a_rs;
			a_inner = a_cs;
		}
		else
		{
			a_outer = a_cs;
			a_inner = a_rs;
		}

		// A row-stored B is better served by moving one row per copy, which
		// improves spatial locality on the destination side.
		if ( bl1_is_row_storage( b_rs, b_cs ) )
		{
			bl1_swap_ints( n_outer, n_inner );
			bl1_swap_ints( a_outer, a_inner );
			bl1_swap_ints( b_outer, b_inner );
		}
	}

	// Reduce trans to the conjugation argument expected by the vector copy.
	conja = bl1_proj_trans1_to_conj( trans );

	for ( k = 0; k < n_outer; ++k )
	{
		// Locate the k-th vector of each operand.
		a_vec = a + k*a_outer;
		b_vec = b + k*b_outer;

		bl1_sccopyv( conja,
		             n_inner,
		             a_vec, a_inner,
		             b_vec, b_inner );
	}
}
// bl1_cscopymt
//
// Copies the scomplex matrix A, as viewed through trans, into the m-by-n
// float matrix B. The work is issued as a sequence of strided mixed-type
// vector copies (bl1_cscopyv): one per column of B, or one per row whenever B
// is row-stored, regardless of how A is stored. How an scomplex element maps
// onto a float element is decided entirely by bl1_cscopyv.
//
//   trans         - selects whether A is read transposed; its conjugation
//                   component is forwarded to bl1_cscopyv.
//   m, n          - row and column counts of B.
//   a, a_rs, a_cs - base pointer, row stride and column stride of A.
//   b, b_rs, b_cs - base pointer, row stride and column stride of B.
void bl1_cscopymt( trans1_t trans, int m, int n, scomplex* a, int a_rs, int a_cs, float* b, int b_rs, int b_cs )
{
	scomplex* a_vec;
	float*    b_vec;
	int       a_outer, a_inner;
	int       b_outer, b_inner;
	int       n_outer, n_inner;
	int       k;
	conj1_t   conja;

	// Leave immediately when bl1_zero_dim2() reports there is nothing to do.
	if ( bl1_zero_dim2( m, n ) ) return;

	if ( bl1_is_vector( m, n ) )
	{
		// Vector operands: a single copy moves everything. The outer strides
		// are only ever scaled by k == 0, so 1 is merely a placeholder.
		n_outer = 1;
		n_inner = bl1_vector_dim( m, n );
		a_outer = 1;
		b_outer = 1;
		a_inner = bl1_vector_inc( trans,             m, n, a_rs, a_cs );
		b_inner = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
	}
	else
	{
		// Matrix operands: by default, move one column of B per copy.
		n_outer = n;
		n_inner = m;
		b_outer = b_cs;
		b_inner = b_rs;

		// A transposed read of A exchanges the roles of its two strides.
		if ( bl1_does_trans( trans ) )
		{
			a_outer = a_rs;
			a_inner = a_cs;
		}
		else
		{
			a_outer = a_cs;
			a_inner = a_rs;
		}

		// A row-stored B is better served by moving one row per copy, which
		// improves spatial locality on the destination side.
		if ( bl1_is_row_storage( b_rs, b_cs ) )
		{
			bl1_swap_ints( n_outer, n_inner );
			bl1_swap_ints( a_outer, a_inner );
			bl1_swap_ints( b_outer, b_inner );
		}
	}

	// Reduce trans to the conjugation argument expected by the vector copy.
	conja = bl1_proj_trans1_to_conj( trans );

	for ( k = 0; k < n_outer; ++k )
	{
		// Locate the k-th vector of each operand.
		a_vec = a + k*a_outer;
		b_vec = b + k*b_outer;

		bl1_cscopyv( conja,
		             n_inner,
		             a_vec, a_inner,
		             b_vec, b_inner );
	}
}
694 
695 // sz zs
// bl1_szcopymt
//
// Copies the float matrix A, as viewed through trans, into the m-by-n
// dcomplex matrix B. The work is issued as a sequence of strided mixed-type
// vector copies (bl1_szcopyv): one per column of B, or one per row whenever B
// is row-stored, regardless of how A is stored. How a float element maps onto
// a dcomplex element is decided entirely by bl1_szcopyv.
//
//   trans         - selects whether A is read transposed; its conjugation
//                   component is forwarded to bl1_szcopyv.
//   m, n          - row and column counts of B.
//   a, a_rs, a_cs - base pointer, row stride and column stride of A.
//   b, b_rs, b_cs - base pointer, row stride and column stride of B.
void bl1_szcopymt( trans1_t trans, int m, int n, float* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs )
{
	float*    a_vec;
	dcomplex* b_vec;
	int       a_outer, a_inner;
	int       b_outer, b_inner;
	int       n_outer, n_inner;
	int       k;
	conj1_t   conja;

	// Leave immediately when bl1_zero_dim2() reports there is nothing to do.
	if ( bl1_zero_dim2( m, n ) ) return;

	if ( bl1_is_vector( m, n ) )
	{
		// Vector operands: a single copy moves everything. The outer strides
		// are only ever scaled by k == 0, so 1 is merely a placeholder.
		n_outer = 1;
		n_inner = bl1_vector_dim( m, n );
		a_outer = 1;
		b_outer = 1;
		a_inner = bl1_vector_inc( trans,             m, n, a_rs, a_cs );
		b_inner = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
	}
	else
	{
		// Matrix operands: by default, move one column of B per copy.
		n_outer = n;
		n_inner = m;
		b_outer = b_cs;
		b_inner = b_rs;

		// A transposed read of A exchanges the roles of its two strides.
		if ( bl1_does_trans( trans ) )
		{
			a_outer = a_rs;
			a_inner = a_cs;
		}
		else
		{
			a_outer = a_cs;
			a_inner = a_rs;
		}

		// A row-stored B is better served by moving one row per copy, which
		// improves spatial locality on the destination side.
		if ( bl1_is_row_storage( b_rs, b_cs ) )
		{
			bl1_swap_ints( n_outer, n_inner );
			bl1_swap_ints( a_outer, a_inner );
			bl1_swap_ints( b_outer, b_inner );
		}
	}

	// Reduce trans to the conjugation argument expected by the vector copy.
	conja = bl1_proj_trans1_to_conj( trans );

	for ( k = 0; k < n_outer; ++k )
	{
		// Locate the k-th vector of each operand.
		a_vec = a + k*a_outer;
		b_vec = b + k*b_outer;

		bl1_szcopyv( conja,
		             n_inner,
		             a_vec, a_inner,
		             b_vec, b_inner );
	}
}
// bl1_zscopymt
//
// Copies the dcomplex matrix A, as viewed through trans, into the m-by-n
// float matrix B. The work is issued as a sequence of strided mixed-type
// vector copies (bl1_zscopyv): one per column of B, or one per row whenever B
// is row-stored, regardless of how A is stored. How a dcomplex element maps
// onto a float element is decided entirely by bl1_zscopyv.
//
//   trans         - selects whether A is read transposed; its conjugation
//                   component is forwarded to bl1_zscopyv.
//   m, n          - row and column counts of B.
//   a, a_rs, a_cs - base pointer, row stride and column stride of A.
//   b, b_rs, b_cs - base pointer, row stride and column stride of B.
void bl1_zscopymt( trans1_t trans, int m, int n, dcomplex* a, int a_rs, int a_cs, float* b, int b_rs, int b_cs )
{
	dcomplex* a_vec;
	float*    b_vec;
	int       a_outer, a_inner;
	int       b_outer, b_inner;
	int       n_outer, n_inner;
	int       k;
	conj1_t   conja;

	// Leave immediately when bl1_zero_dim2() reports there is nothing to do.
	if ( bl1_zero_dim2( m, n ) ) return;

	if ( bl1_is_vector( m, n ) )
	{
		// Vector operands: a single copy moves everything. The outer strides
		// are only ever scaled by k == 0, so 1 is merely a placeholder.
		n_outer = 1;
		n_inner = bl1_vector_dim( m, n );
		a_outer = 1;
		b_outer = 1;
		a_inner = bl1_vector_inc( trans,             m, n, a_rs, a_cs );
		b_inner = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
	}
	else
	{
		// Matrix operands: by default, move one column of B per copy.
		n_outer = n;
		n_inner = m;
		b_outer = b_cs;
		b_inner = b_rs;

		// A transposed read of A exchanges the roles of its two strides.
		if ( bl1_does_trans( trans ) )
		{
			a_outer = a_rs;
			a_inner = a_cs;
		}
		else
		{
			a_outer = a_cs;
			a_inner = a_rs;
		}

		// A row-stored B is better served by moving one row per copy, which
		// improves spatial locality on the destination side.
		if ( bl1_is_row_storage( b_rs, b_cs ) )
		{
			bl1_swap_ints( n_outer, n_inner );
			bl1_swap_ints( a_outer, a_inner );
			bl1_swap_ints( b_outer, b_inner );
		}
	}

	// Reduce trans to the conjugation argument expected by the vector copy.
	conja = bl1_proj_trans1_to_conj( trans );

	for ( k = 0; k < n_outer; ++k )
	{
		// Locate the k-th vector of each operand.
		a_vec = a + k*a_outer;
		b_vec = b + k*b_outer;

		bl1_zscopyv( conja,
		             n_inner,
		             a_vec, a_inner,
		             b_vec, b_inner );
	}
}
828 
829 // dd
// bl1_ddcopymt
//
// Copies the double matrix A, as viewed through trans, into the m-by-n double
// matrix B. The work is issued as a sequence of strided vector copies
// (bl1_dcopyv): one per column of B, or one per row whenever B is row-stored,
// regardless of how A is stored.
//
//   trans         - selects whether A is read transposed; its conjugation
//                   component is forwarded to bl1_dcopyv.
//   m, n          - row and column counts of B.
//   a, a_rs, a_cs - base pointer, row stride and column stride of A.
//   b, b_rs, b_cs - base pointer, row stride and column stride of B.
void bl1_ddcopymt( trans1_t trans, int m, int n, double* a, int a_rs, int a_cs, double* b, int b_rs, int b_cs )
{
	double*   a_vec;
	double*   b_vec;
	int       a_outer, a_inner;
	int       b_outer, b_inner;
	int       n_outer, n_inner;
	int       k;
	conj1_t   conja;

	// Leave immediately when bl1_zero_dim2() reports there is nothing to do.
	if ( bl1_zero_dim2( m, n ) ) return;

	if ( bl1_is_vector( m, n ) )
	{
		// Vector operands: a single copy moves everything. The outer strides
		// are only ever scaled by k == 0, so 1 is merely a placeholder.
		n_outer = 1;
		n_inner = bl1_vector_dim( m, n );
		a_outer = 1;
		b_outer = 1;
		a_inner = bl1_vector_inc( trans,             m, n, a_rs, a_cs );
		b_inner = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
	}
	else
	{
		// Matrix operands: by default, move one column of B per copy.
		n_outer = n;
		n_inner = m;
		b_outer = b_cs;
		b_inner = b_rs;

		// A transposed read of A exchanges the roles of its two strides.
		if ( bl1_does_trans( trans ) )
		{
			a_outer = a_rs;
			a_inner = a_cs;
		}
		else
		{
			a_outer = a_cs;
			a_inner = a_rs;
		}

		// A row-stored B is better served by moving one row per copy, which
		// improves spatial locality on the destination side.
		if ( bl1_is_row_storage( b_rs, b_cs ) )
		{
			bl1_swap_ints( n_outer, n_inner );
			bl1_swap_ints( a_outer, a_inner );
			bl1_swap_ints( b_outer, b_inner );
		}
	}

	// Reduce trans to the conjugation argument expected by the vector copy.
	conja = bl1_proj_trans1_to_conj( trans );

	for ( k = 0; k < n_outer; ++k )
	{
		// Locate the k-th vector of each operand.
		a_vec = a + k*a_outer;
		b_vec = b + k*b_outer;

		bl1_dcopyv( conja,
		            n_inner,
		            a_vec, a_inner,
		            b_vec, b_inner );
	}
}
896 
897 // dc cd
// bl1_dccopymt
//
// Copies the double matrix A, as viewed through trans, into the m-by-n
// scomplex matrix B. The work is issued as a sequence of strided mixed-type
// vector copies (bl1_dccopyv): one per column of B, or one per row whenever B
// is row-stored, regardless of how A is stored. How a double element maps
// onto an scomplex element is decided entirely by bl1_dccopyv.
//
//   trans         - selects whether A is read transposed; its conjugation
//                   component is forwarded to bl1_dccopyv.
//   m, n          - row and column counts of B.
//   a, a_rs, a_cs - base pointer, row stride and column stride of A.
//   b, b_rs, b_cs - base pointer, row stride and column stride of B.
void bl1_dccopymt( trans1_t trans, int m, int n, double* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs )
{
	double*   a_vec;
	scomplex* b_vec;
	int       a_outer, a_inner;
	int       b_outer, b_inner;
	int       n_outer, n_inner;
	int       k;
	conj1_t   conja;

	// Leave immediately when bl1_zero_dim2() reports there is nothing to do.
	if ( bl1_zero_dim2( m, n ) ) return;

	if ( bl1_is_vector( m, n ) )
	{
		// Vector operands: a single copy moves everything. The outer strides
		// are only ever scaled by k == 0, so 1 is merely a placeholder.
		n_outer = 1;
		n_inner = bl1_vector_dim( m, n );
		a_outer = 1;
		b_outer = 1;
		a_inner = bl1_vector_inc( trans,             m, n, a_rs, a_cs );
		b_inner = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
	}
	else
	{
		// Matrix operands: by default, move one column of B per copy.
		n_outer = n;
		n_inner = m;
		b_outer = b_cs;
		b_inner = b_rs;

		// A transposed read of A exchanges the roles of its two strides.
		if ( bl1_does_trans( trans ) )
		{
			a_outer = a_rs;
			a_inner = a_cs;
		}
		else
		{
			a_outer = a_cs;
			a_inner = a_rs;
		}

		// A row-stored B is better served by moving one row per copy, which
		// improves spatial locality on the destination side.
		if ( bl1_is_row_storage( b_rs, b_cs ) )
		{
			bl1_swap_ints( n_outer, n_inner );
			bl1_swap_ints( a_outer, a_inner );
			bl1_swap_ints( b_outer, b_inner );
		}
	}

	// Reduce trans to the conjugation argument expected by the vector copy.
	conja = bl1_proj_trans1_to_conj( trans );

	for ( k = 0; k < n_outer; ++k )
	{
		// Locate the k-th vector of each operand.
		a_vec = a + k*a_outer;
		b_vec = b + k*b_outer;

		bl1_dccopyv( conja,
		             n_inner,
		             a_vec, a_inner,
		             b_vec, b_inner );
	}
}
// bl1_cdcopymt
//
// Copies the scomplex matrix A, as viewed through trans, into the m-by-n
// double matrix B. The work is issued as a sequence of strided mixed-type
// vector copies (bl1_cdcopyv): one per column of B, or one per row whenever B
// is row-stored, regardless of how A is stored. How an scomplex element maps
// onto a double element is decided entirely by bl1_cdcopyv.
//
//   trans         - selects whether A is read transposed; its conjugation
//                   component is forwarded to bl1_cdcopyv.
//   m, n          - row and column counts of B.
//   a, a_rs, a_cs - base pointer, row stride and column stride of A.
//   b, b_rs, b_cs - base pointer, row stride and column stride of B.
void bl1_cdcopymt( trans1_t trans, int m, int n, scomplex* a, int a_rs, int a_cs, double* b, int b_rs, int b_cs )
{
	scomplex* a_vec;
	double*   b_vec;
	int       a_outer, a_inner;
	int       b_outer, b_inner;
	int       n_outer, n_inner;
	int       k;
	conj1_t   conja;

	// Leave immediately when bl1_zero_dim2() reports there is nothing to do.
	if ( bl1_zero_dim2( m, n ) ) return;

	if ( bl1_is_vector( m, n ) )
	{
		// Vector operands: a single copy moves everything. The outer strides
		// are only ever scaled by k == 0, so 1 is merely a placeholder.
		n_outer = 1;
		n_inner = bl1_vector_dim( m, n );
		a_outer = 1;
		b_outer = 1;
		a_inner = bl1_vector_inc( trans,             m, n, a_rs, a_cs );
		b_inner = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
	}
	else
	{
		// Matrix operands: by default, move one column of B per copy.
		n_outer = n;
		n_inner = m;
		b_outer = b_cs;
		b_inner = b_rs;

		// A transposed read of A exchanges the roles of its two strides.
		if ( bl1_does_trans( trans ) )
		{
			a_outer = a_rs;
			a_inner = a_cs;
		}
		else
		{
			a_outer = a_cs;
			a_inner = a_rs;
		}

		// A row-stored B is better served by moving one row per copy, which
		// improves spatial locality on the destination side.
		if ( bl1_is_row_storage( b_rs, b_cs ) )
		{
			bl1_swap_ints( n_outer, n_inner );
			bl1_swap_ints( a_outer, a_inner );
			bl1_swap_ints( b_outer, b_inner );
		}
	}

	// Reduce trans to the conjugation argument expected by the vector copy.
	conja = bl1_proj_trans1_to_conj( trans );

	for ( k = 0; k < n_outer; ++k )
	{
		// Locate the k-th vector of each operand.
		a_vec = a + k*a_outer;
		b_vec = b + k*b_outer;

		bl1_cdcopyv( conja,
		             n_inner,
		             a_vec, a_inner,
		             b_vec, b_inner );
	}
}
1030 
1031 // dz zd
// bl1_dzcopymt
//
// Copies the double matrix A into the dcomplex matrix B by invoking
// bl1_dzcopyv() once per column of B, once per row of B when B is stored
// by rows, or exactly once when the operands are vectors.
//
//   trans          - selects whether A's strides are read transposed; its
//                    conj component is extracted and passed to bl1_dzcopyv().
//   m, n           - row and column dimensions of B. Nothing is done when
//                    bl1_zero_dim2( m, n ) reports an empty operand.
//   a, a_rs, a_cs  - base pointer, row stride, and column stride of A.
//   b, b_rs, b_cs  - base pointer, row stride, and column stride of B.
void bl1_dzcopymt( trans1_t trans, int m, int n, double* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs )
{
	conj1_t   conj;
	int       n_vecs, vec_len;
	int       a_ld, a_inc;
	int       b_ld, b_inc;
	int       k;

	// An empty operand means there is nothing to copy.
	if ( bl1_zero_dim2( m, n ) ) return;

	// The conj component of trans is handed to every vector copy below.
	conj = bl1_proj_trans1_to_conj( trans );

	if ( bl1_is_vector( m, n ) )
	{
		// Vector operands: a single vector copy covers everything, so the
		// leading dimensions are only ever multiplied by zero.
		n_vecs  = 1;
		vec_len = bl1_vector_dim( m, n );
		a_ld    = 1;
		a_inc   = bl1_vector_inc( trans,              m, n, a_rs, a_cs );
		b_ld    = 1;
		b_inc   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
	}
	else
	{
		// Matrix operands: start out walking B one column at a time.
		n_vecs  = n;
		vec_len = m;
		b_ld    = b_cs;
		b_inc   = b_rs;

		// A transposed read of A exchanges the roles of its two strides.
		if ( bl1_does_trans( trans ) )
		{
			a_ld  = a_rs;
			a_inc = a_cs;
		}
		else
		{
			a_ld  = a_cs;
			a_inc = a_rs;
		}

		// When B is stored by rows, walk both operands one row of B at a
		// time instead, so that the writes to B are contiguous.
		if ( bl1_is_row_storage( b_rs, b_cs ) )
		{
			bl1_swap_ints( n_vecs, vec_len );
			bl1_swap_ints( a_ld, a_inc );
			bl1_swap_ints( b_ld, b_inc );
		}
	}

	// Copy one vector of A into the corresponding vector of B per pass.
	for ( k = 0; k < n_vecs; ++k )
	{
		bl1_dzcopyv( conj,
		             vec_len,
		             a + k*a_ld, a_inc,
		             b + k*b_ld, b_inc );
	}
}
// bl1_zdcopymt
//
// Copies the dcomplex matrix A into the double matrix B by invoking
// bl1_zdcopyv() once per column of B, once per row of B when B is stored
// by rows, or exactly once when the operands are vectors.
//
//   trans          - selects whether A's strides are read transposed; its
//                    conj component is extracted and passed to bl1_zdcopyv().
//   m, n           - row and column dimensions of B. Nothing is done when
//                    bl1_zero_dim2( m, n ) reports an empty operand.
//   a, a_rs, a_cs  - base pointer, row stride, and column stride of A.
//   b, b_rs, b_cs  - base pointer, row stride, and column stride of B.
void bl1_zdcopymt( trans1_t trans, int m, int n, dcomplex* a, int a_rs, int a_cs, double* b, int b_rs, int b_cs )
{
	conj1_t   conj;
	int       n_vecs, vec_len;
	int       a_ld, a_inc;
	int       b_ld, b_inc;
	int       k;

	// An empty operand means there is nothing to copy.
	if ( bl1_zero_dim2( m, n ) ) return;

	// The conj component of trans is handed to every vector copy below.
	conj = bl1_proj_trans1_to_conj( trans );

	if ( bl1_is_vector( m, n ) )
	{
		// Vector operands: a single vector copy covers everything, so the
		// leading dimensions are only ever multiplied by zero.
		n_vecs  = 1;
		vec_len = bl1_vector_dim( m, n );
		a_ld    = 1;
		a_inc   = bl1_vector_inc( trans,              m, n, a_rs, a_cs );
		b_ld    = 1;
		b_inc   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
	}
	else
	{
		// Matrix operands: start out walking B one column at a time.
		n_vecs  = n;
		vec_len = m;
		b_ld    = b_cs;
		b_inc   = b_rs;

		// A transposed read of A exchanges the roles of its two strides.
		if ( bl1_does_trans( trans ) )
		{
			a_ld  = a_rs;
			a_inc = a_cs;
		}
		else
		{
			a_ld  = a_cs;
			a_inc = a_rs;
		}

		// When B is stored by rows, walk both operands one row of B at a
		// time instead, so that the writes to B are contiguous.
		if ( bl1_is_row_storage( b_rs, b_cs ) )
		{
			bl1_swap_ints( n_vecs, vec_len );
			bl1_swap_ints( a_ld, a_inc );
			bl1_swap_ints( b_ld, b_inc );
		}
	}

	// Copy one vector of A into the corresponding vector of B per pass.
	for ( k = 0; k < n_vecs; ++k )
	{
		bl1_zdcopyv( conj,
		             vec_len,
		             a + k*a_ld, a_inc,
		             b + k*b_ld, b_inc );
	}
}
1164 
1165 // cc
// bl1_cccopymt
//
// Copies the scomplex matrix A into the scomplex matrix B by invoking
// bl1_ccopyv() once per column of B, once per row of B when B is stored
// by rows, or exactly once when the operands are vectors.
//
//   trans          - selects whether A's strides are read transposed; its
//                    conj component is extracted and passed to bl1_ccopyv().
//   m, n           - row and column dimensions of B. Nothing is done when
//                    bl1_zero_dim2( m, n ) reports an empty operand.
//   a, a_rs, a_cs  - base pointer, row stride, and column stride of A.
//   b, b_rs, b_cs  - base pointer, row stride, and column stride of B.
void bl1_cccopymt( trans1_t trans, int m, int n, scomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs )
{
	conj1_t   conj;
	int       n_vecs, vec_len;
	int       a_ld, a_inc;
	int       b_ld, b_inc;
	int       k;

	// An empty operand means there is nothing to copy.
	if ( bl1_zero_dim2( m, n ) ) return;

	// The conj component of trans is handed to every vector copy below.
	conj = bl1_proj_trans1_to_conj( trans );

	if ( bl1_is_vector( m, n ) )
	{
		// Vector operands: a single vector copy covers everything, so the
		// leading dimensions are only ever multiplied by zero.
		n_vecs  = 1;
		vec_len = bl1_vector_dim( m, n );
		a_ld    = 1;
		a_inc   = bl1_vector_inc( trans,              m, n, a_rs, a_cs );
		b_ld    = 1;
		b_inc   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
	}
	else
	{
		// Matrix operands: start out walking B one column at a time.
		n_vecs  = n;
		vec_len = m;
		b_ld    = b_cs;
		b_inc   = b_rs;

		// A transposed read of A exchanges the roles of its two strides.
		if ( bl1_does_trans( trans ) )
		{
			a_ld  = a_rs;
			a_inc = a_cs;
		}
		else
		{
			a_ld  = a_cs;
			a_inc = a_rs;
		}

		// When B is stored by rows, walk both operands one row of B at a
		// time instead, so that the writes to B are contiguous.
		if ( bl1_is_row_storage( b_rs, b_cs ) )
		{
			bl1_swap_ints( n_vecs, vec_len );
			bl1_swap_ints( a_ld, a_inc );
			bl1_swap_ints( b_ld, b_inc );
		}
	}

	// Copy one vector of A into the corresponding vector of B per pass.
	for ( k = 0; k < n_vecs; ++k )
	{
		bl1_ccopyv( conj,
		            vec_len,
		            a + k*a_ld, a_inc,
		            b + k*b_ld, b_inc );
	}
}
1232 
1233 // cz zc
// bl1_czcopymt
//
// Copies the scomplex matrix A into the dcomplex matrix B by invoking
// bl1_czcopyv() once per column of B, once per row of B when B is stored
// by rows, or exactly once when the operands are vectors.
//
//   trans          - selects whether A's strides are read transposed; its
//                    conj component is extracted and passed to bl1_czcopyv().
//   m, n           - row and column dimensions of B. Nothing is done when
//                    bl1_zero_dim2( m, n ) reports an empty operand.
//   a, a_rs, a_cs  - base pointer, row stride, and column stride of A.
//   b, b_rs, b_cs  - base pointer, row stride, and column stride of B.
void bl1_czcopymt( trans1_t trans, int m, int n, scomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs )
{
	conj1_t   conj;
	int       n_vecs, vec_len;
	int       a_ld, a_inc;
	int       b_ld, b_inc;
	int       k;

	// An empty operand means there is nothing to copy.
	if ( bl1_zero_dim2( m, n ) ) return;

	// The conj component of trans is handed to every vector copy below.
	conj = bl1_proj_trans1_to_conj( trans );

	if ( bl1_is_vector( m, n ) )
	{
		// Vector operands: a single vector copy covers everything, so the
		// leading dimensions are only ever multiplied by zero.
		n_vecs  = 1;
		vec_len = bl1_vector_dim( m, n );
		a_ld    = 1;
		a_inc   = bl1_vector_inc( trans,              m, n, a_rs, a_cs );
		b_ld    = 1;
		b_inc   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
	}
	else
	{
		// Matrix operands: start out walking B one column at a time.
		n_vecs  = n;
		vec_len = m;
		b_ld    = b_cs;
		b_inc   = b_rs;

		// A transposed read of A exchanges the roles of its two strides.
		if ( bl1_does_trans( trans ) )
		{
			a_ld  = a_rs;
			a_inc = a_cs;
		}
		else
		{
			a_ld  = a_cs;
			a_inc = a_rs;
		}

		// When B is stored by rows, walk both operands one row of B at a
		// time instead, so that the writes to B are contiguous.
		if ( bl1_is_row_storage( b_rs, b_cs ) )
		{
			bl1_swap_ints( n_vecs, vec_len );
			bl1_swap_ints( a_ld, a_inc );
			bl1_swap_ints( b_ld, b_inc );
		}
	}

	// Copy one vector of A into the corresponding vector of B per pass.
	for ( k = 0; k < n_vecs; ++k )
	{
		bl1_czcopyv( conj,
		             vec_len,
		             a + k*a_ld, a_inc,
		             b + k*b_ld, b_inc );
	}
}
// bl1_zccopymt
//
// Copies the dcomplex matrix A into the scomplex matrix B by invoking
// bl1_zccopyv() once per column of B, once per row of B when B is stored
// by rows, or exactly once when the operands are vectors.
//
//   trans          - selects whether A's strides are read transposed; its
//                    conj component is extracted and passed to bl1_zccopyv().
//   m, n           - row and column dimensions of B. Nothing is done when
//                    bl1_zero_dim2( m, n ) reports an empty operand.
//   a, a_rs, a_cs  - base pointer, row stride, and column stride of A.
//   b, b_rs, b_cs  - base pointer, row stride, and column stride of B.
void bl1_zccopymt( trans1_t trans, int m, int n, dcomplex* a, int a_rs, int a_cs, scomplex* b, int b_rs, int b_cs )
{
	conj1_t   conj;
	int       n_vecs, vec_len;
	int       a_ld, a_inc;
	int       b_ld, b_inc;
	int       k;

	// An empty operand means there is nothing to copy.
	if ( bl1_zero_dim2( m, n ) ) return;

	// The conj component of trans is handed to every vector copy below.
	conj = bl1_proj_trans1_to_conj( trans );

	if ( bl1_is_vector( m, n ) )
	{
		// Vector operands: a single vector copy covers everything, so the
		// leading dimensions are only ever multiplied by zero.
		n_vecs  = 1;
		vec_len = bl1_vector_dim( m, n );
		a_ld    = 1;
		a_inc   = bl1_vector_inc( trans,              m, n, a_rs, a_cs );
		b_ld    = 1;
		b_inc   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
	}
	else
	{
		// Matrix operands: start out walking B one column at a time.
		n_vecs  = n;
		vec_len = m;
		b_ld    = b_cs;
		b_inc   = b_rs;

		// A transposed read of A exchanges the roles of its two strides.
		if ( bl1_does_trans( trans ) )
		{
			a_ld  = a_rs;
			a_inc = a_cs;
		}
		else
		{
			a_ld  = a_cs;
			a_inc = a_rs;
		}

		// When B is stored by rows, walk both operands one row of B at a
		// time instead, so that the writes to B are contiguous.
		if ( bl1_is_row_storage( b_rs, b_cs ) )
		{
			bl1_swap_ints( n_vecs, vec_len );
			bl1_swap_ints( a_ld, a_inc );
			bl1_swap_ints( b_ld, b_inc );
		}
	}

	// Copy one vector of A into the corresponding vector of B per pass.
	for ( k = 0; k < n_vecs; ++k )
	{
		bl1_zccopyv( conj,
		             vec_len,
		             a + k*a_ld, a_inc,
		             b + k*b_ld, b_inc );
	}
}
1366 
1367 // zz
// bl1_zzcopymt
//
// Copies the dcomplex matrix A into the dcomplex matrix B by invoking
// bl1_zcopyv() once per column of B, once per row of B when B is stored
// by rows, or exactly once when the operands are vectors.
//
//   trans          - selects whether A's strides are read transposed; its
//                    conj component is extracted and passed to bl1_zcopyv().
//   m, n           - row and column dimensions of B. Nothing is done when
//                    bl1_zero_dim2( m, n ) reports an empty operand.
//   a, a_rs, a_cs  - base pointer, row stride, and column stride of A.
//   b, b_rs, b_cs  - base pointer, row stride, and column stride of B.
void bl1_zzcopymt( trans1_t trans, int m, int n, dcomplex* a, int a_rs, int a_cs, dcomplex* b, int b_rs, int b_cs )
{
	conj1_t   conj;
	int       n_vecs, vec_len;
	int       a_ld, a_inc;
	int       b_ld, b_inc;
	int       k;

	// An empty operand means there is nothing to copy.
	if ( bl1_zero_dim2( m, n ) ) return;

	// The conj component of trans is handed to every vector copy below.
	conj = bl1_proj_trans1_to_conj( trans );

	if ( bl1_is_vector( m, n ) )
	{
		// Vector operands: a single vector copy covers everything, so the
		// leading dimensions are only ever multiplied by zero.
		n_vecs  = 1;
		vec_len = bl1_vector_dim( m, n );
		a_ld    = 1;
		a_inc   = bl1_vector_inc( trans,              m, n, a_rs, a_cs );
		b_ld    = 1;
		b_inc   = bl1_vector_inc( BLIS1_NO_TRANSPOSE, m, n, b_rs, b_cs );
	}
	else
	{
		// Matrix operands: start out walking B one column at a time.
		n_vecs  = n;
		vec_len = m;
		b_ld    = b_cs;
		b_inc   = b_rs;

		// A transposed read of A exchanges the roles of its two strides.
		if ( bl1_does_trans( trans ) )
		{
			a_ld  = a_rs;
			a_inc = a_cs;
		}
		else
		{
			a_ld  = a_cs;
			a_inc = a_rs;
		}

		// When B is stored by rows, walk both operands one row of B at a
		// time instead, so that the writes to B are contiguous.
		if ( bl1_is_row_storage( b_rs, b_cs ) )
		{
			bl1_swap_ints( n_vecs, vec_len );
			bl1_swap_ints( a_ld, a_inc );
			bl1_swap_ints( b_ld, b_inc );
		}
	}

	// Copy one vector of A into the corresponding vector of B per pass.
	for ( k = 0; k < n_vecs; ++k )
	{
		bl1_zcopyv( conj,
		            vec_len,
		            a + k*a_ld, a_inc,
		            b + k*b_ld, b_inc );
	}
}
1434 
1435