1 /*
2 
3    BLIS
4    An object-based framework for developing high-performance BLAS-like
5    libraries.
6 
7    Copyright (C) 2014, The University of Texas at Austin
8 
9    Redistribution and use in source and binary forms, with or without
10    modification, are permitted provided that the following conditions are
11    met:
12     - Redistributions of source code must retain the above copyright
13       notice, this list of conditions and the following disclaimer.
14     - Redistributions in binary form must reproduce the above copyright
15       notice, this list of conditions and the following disclaimer in the
16       documentation and/or other materials provided with the distribution.
17     - Neither the name(s) of the copyright holder(s) nor the names of its
18       contributors may be used to endorse or promote products derived
19       from this software without specific prior written permission.
20 
21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 */
34 
35 #include "blis.h"
36 
37 
38 #undef  GENTFUNCCO
39 #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, arch, suf, diagop ) \
40 \
41 void PASTEMAC3(ch,opname,arch,suf) \
42      ( \
43        ctype*     restrict a, \
44        ctype*     restrict b, \
45        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
46        auxinfo_t* restrict data, \
47        cntx_t*    restrict cntx  \
48      ) \
49 { \
50 	const num_t       dt     = PASTEMAC(ch,type); \
51 \
52 	const dim_t       mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
53 	const dim_t       nr     = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
54 \
55 	const inc_t       packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \
56 	const inc_t       packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
57 \
58 	const dim_t       m      = mr; \
59 	const dim_t       n      = nr; \
60 \
61 	const inc_t       rs_a  = 1; \
62 	const inc_t       cs_a  = packmr; \
63 \
64 	const inc_t       rs_b  = packnr; \
65 	const inc_t       cs_b  = 1; \
66 \
67 	const inc_t       ld_a  = cs_a; \
68 	const inc_t       ld_b  = rs_b; \
69 \
70 	const pack_t      schema_b = bli_cntx_schema_b_panel( cntx ); \
71 \
72 	dim_t             iter, i, j, l; \
73 	dim_t             n_behind; \
74 \
75 \
76 	if ( bli_is_1e_packed( schema_b ) ) \
77 	{ \
78 		const inc_t       rs_a2 = 1 * rs_a; \
79 		const inc_t       cs_a2 = 2 * cs_a; \
80 \
81 		ctype_r* restrict a_r   = ( ctype_r* )a; \
82 		ctype_r* restrict a_i   = ( ctype_r* )a + ld_a; \
83 \
84 		ctype*   restrict b_ri  = ( ctype*   )b; \
85 		ctype*   restrict b_ir  = ( ctype*   )b + ld_b/2; \
86 \
87 		for ( iter = 0; iter < m; ++iter ) \
88 		{ \
89 			i         = iter; \
90 			n_behind  = i; \
91 \
92 			ctype_r* restrict alpha11_r  = a_r  + (i  )*rs_a2 + (i  )*cs_a2; \
93 			ctype_r* restrict alpha11_i  = a_i  + (i  )*rs_a2 + (i  )*cs_a2; \
94 			ctype_r* restrict a10t_r     = a_r  + (i  )*rs_a2 + (0  )*cs_a2; \
95 			ctype_r* restrict a10t_i     = a_i  + (i  )*rs_a2 + (0  )*cs_a2; \
96 			ctype*   restrict b1_ri      = b_ri + (i  )*rs_b  + (0  )*cs_b; \
97 			ctype*   restrict b1_ir      = b_ir + (i  )*rs_b  + (0  )*cs_b; \
98 			ctype*   restrict B0_ri      = b_ri + (0  )*rs_b  + (0  )*cs_b; \
99 \
100 			/* b1 = b1 - a10t * B0; */ \
101 			/* b1 = b1 / alpha11; */ \
102 			for ( j = 0; j < n; ++j ) \
103 			{ \
104 				ctype*   restrict beta11_ri = b1_ri + (0  )*rs_b + (j  )*cs_b; \
105 				ctype*   restrict beta11_ir = b1_ir + (0  )*rs_b + (j  )*cs_b; \
106 				ctype*   restrict b01_ri    = B0_ri + (0  )*rs_b + (j  )*cs_b; \
107 				ctype*   restrict gamma11   = c     + (i  )*rs_c + (j  )*cs_c; \
108 				ctype_r           beta11c_r = PASTEMAC(ch,real)( *beta11_ri ); \
109 				ctype_r           beta11c_i = PASTEMAC(ch,imag)( *beta11_ri ); \
110 				ctype_r           rho11_r; \
111 				ctype_r           rho11_i; \
112 \
113 				/* beta11 = beta11 - a10t * b01; */ \
114 				PASTEMAC(ch,set0ris)( rho11_r, \
115 				                      rho11_i ); \
116 				for ( l = 0; l < n_behind; ++l ) \
117 				{ \
118 					ctype_r* restrict alpha10_r = a10t_r  + (l  )*cs_a2; \
119 					ctype_r* restrict alpha10_i = a10t_i  + (l  )*cs_a2; \
120 					ctype*   restrict beta01_ri = b01_ri  + (l  )*rs_b; \
121 					ctype_r* restrict beta01_r  = &PASTEMAC(ch,real)( *beta01_ri ); \
122 					ctype_r* restrict beta01_i  = &PASTEMAC(ch,imag)( *beta01_ri ); \
123 \
124 					PASTEMAC(ch,axpyris)( *alpha10_r, \
125 					                      *alpha10_i, \
126 					                      *beta01_r, \
127 					                      *beta01_i, \
128 					                      rho11_r, \
129 					                      rho11_i ); \
130 				} \
131 				PASTEMAC(ch,subris)( rho11_r, \
132 				                     rho11_i, \
133 				                     beta11c_r, \
134 				                     beta11c_i ); \
135 \
136 				/* beta11 = beta11 / alpha11; */ \
137 				/* NOTE: When preinversion is enabled, the INVERSE of alpha11
138 				   (1.0/alpha11) is stored during packing instead alpha11 so we
139 				   can multiply rather than divide. When preinversion is disabled,
140 				   alpha11 is stored and division happens below explicitly. */ \
141 				PASTEMAC(ch,diagop)( *alpha11_r, \
142 				                     *alpha11_i, \
143 				                     beta11c_r, \
144 				                     beta11c_i ); \
145 \
146 				/* Output final result to matrix c. */ \
147 				PASTEMAC(ch,sets)(  beta11c_r, beta11c_i, *gamma11 ); \
148 \
149 				/* Store the local values back to b11. */ \
150 				PASTEMAC(ch,sets)(  beta11c_r, beta11c_i, *beta11_ri ); \
151 				PASTEMAC(ch,sets)( -beta11c_i, beta11c_r, *beta11_ir ); \
152 			} \
153 		} \
154 	} \
155 	else /* ( bli_is_1r_packed( schema_b ) ) */ \
156 	{ \
157 		const inc_t       rs_b2 = 2 * rs_b; \
158 		const inc_t       cs_b2 = 1 * cs_b; \
159 \
160 		ctype*   restrict a_ri  = ( ctype*   )a; \
161 		/*ctype*   restrict a_ir  = ( ctype*   )a + ld_a/2;*/ \
162 \
163 		ctype_r* restrict b_r   = ( ctype_r* )b; \
164 		ctype_r* restrict b_i   = ( ctype_r* )b + ld_b; \
165 \
166 		for ( iter = 0; iter < m; ++iter ) \
167 		{ \
168 			i         = iter; \
169 			n_behind  = i; \
170 \
171 			ctype*   restrict alpha11_ri = a_ri + (i  )*rs_a  + (i  )*cs_a; \
172 			ctype_r* restrict alpha11_r  = &PASTEMAC(ch,real)( *alpha11_ri ); \
173 			ctype_r* restrict alpha11_i  = &PASTEMAC(ch,imag)( *alpha11_ri ); \
174 			ctype*   restrict a10t_ri    = a_ri + (i  )*rs_a  + (0  )*cs_a; \
175 			ctype_r* restrict b1_r       = b_r  + (i  )*rs_b2 + (0  )*cs_b2; \
176 			ctype_r* restrict b1_i       = b_i  + (i  )*rs_b2 + (0  )*cs_b2; \
177 			ctype_r* restrict B0_r       = b_r  + (0  )*rs_b2 + (0  )*cs_b2; \
178 			ctype_r* restrict B0_i       = b_i  + (0  )*rs_b2 + (0  )*cs_b2; \
179 \
180 			/* b1 = b1 - a10t * B0; */ \
181 			/* b1 = b1 / alpha11; */ \
182 			for ( j = 0; j < n; ++j ) \
183 			{ \
184 				ctype_r* restrict beta11_r  = b1_r + (0  )*rs_b2 + (j  )*cs_b2; \
185 				ctype_r* restrict beta11_i  = b1_i + (0  )*rs_b2 + (j  )*cs_b2; \
186 				ctype_r* restrict b01_r     = B0_r + (0  )*rs_b2 + (j  )*cs_b2; \
187 				ctype_r* restrict b01_i     = B0_i + (0  )*rs_b2 + (j  )*cs_b2; \
188 				ctype*   restrict gamma11   = c    + (i  )*rs_c  + (j  )*cs_c; \
189 				ctype_r           beta11c_r = *beta11_r; \
190 				ctype_r           beta11c_i = *beta11_i; \
191 				ctype_r           rho11_r; \
192 				ctype_r           rho11_i; \
193 \
194 				/* beta11 = beta11 - a10t * b01; */ \
195 				PASTEMAC(ch,set0ris)( rho11_r, \
196 				                      rho11_i ); \
197 				for ( l = 0; l < n_behind; ++l ) \
198 				{ \
199 					ctype*   restrict alpha10_ri = a10t_ri + (l  )*cs_a; \
200 					ctype_r* restrict alpha10_r  = &PASTEMAC(ch,real)( *alpha10_ri ); \
201 					ctype_r* restrict alpha10_i  = &PASTEMAC(ch,imag)( *alpha10_ri ); \
202 					ctype_r* restrict beta01_r   = b01_r   + (l  )*rs_b2; \
203 					ctype_r* restrict beta01_i   = b01_i   + (l  )*rs_b2; \
204 \
205 					PASTEMAC(ch,axpyris)( *alpha10_r, \
206 					                      *alpha10_i, \
207 					                      *beta01_r, \
208 					                      *beta01_i, \
209 					                      rho11_r, \
210 					                      rho11_i ); \
211 				} \
212 				PASTEMAC(ch,subris)( rho11_r, \
213 				                     rho11_i, \
214 				                     beta11c_r, \
215 				                     beta11c_i ); \
216 \
217 				/* beta11 = beta11 / alpha11; */ \
218 				/* NOTE: When preinversion is enabled, the INVERSE of alpha11
219 				   (1.0/alpha11) is stored during packing instead alpha11 so we
220 				   can multiply rather than divide. When preinversion is disabled,
221 				   alpha11 is stored and division happens below explicitly. */ \
222 				PASTEMAC(ch,diagop)( *alpha11_r, \
223 				                     *alpha11_i, \
224 				                     beta11c_r, \
225 				                     beta11c_i ); \
226 \
227 				/* Output final result to matrix c. */ \
228 				PASTEMAC(ch,sets)( beta11c_r, \
229 				                   beta11c_i, *gamma11 ); \
230 \
231 				/* Store the local values back to b11. */ \
232 				PASTEMAC(ch,copyris)( beta11c_r, \
233 				                      beta11c_i, \
234 				                      *beta11_r, \
235 				                      *beta11_i ); \
236 			} \
237 		} \
238 	} \
239 }
240 
241 #ifdef BLIS_ENABLE_TRSM_PREINVERSION
242 INSERT_GENTFUNCCO_BASIC3( trsm1m_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scalris )
243 #else
244 INSERT_GENTFUNCCO_BASIC3( trsm1m_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscalris )
245 #endif
246 
247 
248 #undef  GENTFUNCCO
249 #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, arch, suf, diagop ) \
250 \
251 void PASTEMAC3(ch,opname,arch,suf) \
252      ( \
253        ctype*     restrict a, \
254        ctype*     restrict b, \
255        ctype*     restrict c, inc_t rs_c, inc_t cs_c, \
256        auxinfo_t* restrict data, \
257        cntx_t*    restrict cntx  \
258      ) \
259 { \
260 	const num_t       dt     = PASTEMAC(ch,type); \
261 \
262 	const dim_t       mr     = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \
263 	const dim_t       nr     = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \
264 \
265 	const inc_t       packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \
266 	const inc_t       packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \
267 \
268 	const dim_t       m      = mr; \
269 	const dim_t       n      = nr; \
270 \
271 	const inc_t       rs_a  = 1; \
272 	const inc_t       cs_a  = packmr; \
273 \
274 	const inc_t       rs_b  = packnr; \
275 	const inc_t       cs_b  = 1; \
276 \
277 	const inc_t       ld_a  = cs_a; \
278 	const inc_t       ld_b  = rs_b; \
279 \
280 	const pack_t      schema_b = bli_cntx_schema_b_panel( cntx ); \
281 \
282 	dim_t             iter, i, j, l; \
283 	dim_t             n_behind; \
284 \
285 \
286 	if ( bli_is_1e_packed( schema_b ) ) \
287 	{ \
288 		const inc_t       rs_a2 = 1 * rs_a; \
289 		const inc_t       cs_a2 = 2 * cs_a; \
290 \
291 		ctype_r* restrict a_r   = ( ctype_r* )a; \
292 		ctype_r* restrict a_i   = ( ctype_r* )a + ld_a; \
293 \
294 		ctype*   restrict b_ri  = ( ctype*   )b; \
295 		ctype*   restrict b_ir  = ( ctype*   )b + ld_b/2; \
296 \
297 		for ( iter = 0; iter < m; ++iter ) \
298 		{ \
299 			i         = m - iter - 1; \
300 			n_behind  = iter; \
301 \
302 			ctype_r* restrict alpha11_r  = a_r  + (i  )*rs_a2 + (i  )*cs_a2; \
303 			ctype_r* restrict alpha11_i  = a_i  + (i  )*rs_a2 + (i  )*cs_a2; \
304 			ctype_r* restrict a12t_r     = a_r  + (i  )*rs_a2 + (i+1)*cs_a2; \
305 			ctype_r* restrict a12t_i     = a_i  + (i  )*rs_a2 + (i+1)*cs_a2; \
306 			ctype*   restrict b1_ri      = b_ri + (i  )*rs_b  + (0  )*cs_b; \
307 			ctype*   restrict b1_ir      = b_ir + (i  )*rs_b  + (0  )*cs_b; \
308 			ctype*   restrict B2_ri      = b_ri + (i+1)*rs_b  + (0  )*cs_b; \
309 \
310 			/* b1 = b1 - a12t * B2; */ \
311 			/* b1 = b1 / alpha11; */ \
312 			for ( j = 0; j < n; ++j ) \
313 			{ \
314 				ctype*   restrict beta11_ri = b1_ri + (0  )*rs_b + (j  )*cs_b; \
315 				ctype*   restrict beta11_ir = b1_ir + (0  )*rs_b + (j  )*cs_b; \
316 				ctype*   restrict b21_ri    = B2_ri + (0  )*rs_b + (j  )*cs_b; \
317 				ctype*   restrict gamma11   = c     + (i  )*rs_c + (j  )*cs_c; \
318 				ctype_r           beta11c_r = PASTEMAC(ch,real)( *beta11_ri ); \
319 				ctype_r           beta11c_i = PASTEMAC(ch,imag)( *beta11_ri ); \
320 				ctype_r           rho11_r; \
321 				ctype_r           rho11_i; \
322 \
323 				/* beta11 = beta11 - a10t * b01; */ \
324 				PASTEMAC(ch,set0ris)( rho11_r, \
325 				                      rho11_i ); \
326 				for ( l = 0; l < n_behind; ++l ) \
327 				{ \
328 					ctype_r* restrict alpha12_r = a12t_r  + (l  )*cs_a2; \
329 					ctype_r* restrict alpha12_i = a12t_i  + (l  )*cs_a2; \
330 					ctype*   restrict beta21_ri = b21_ri  + (l  )*rs_b; \
331 					ctype_r* restrict beta21_r  = &PASTEMAC(ch,real)( *beta21_ri ); \
332 					ctype_r* restrict beta21_i  = &PASTEMAC(ch,imag)( *beta21_ri ); \
333 \
334 					PASTEMAC(ch,axpyris)( *alpha12_r, \
335 					                      *alpha12_i, \
336 					                      *beta21_r, \
337 					                      *beta21_i, \
338 					                      rho11_r, \
339 					                      rho11_i ); \
340 				} \
341 				PASTEMAC(ch,subris)( rho11_r, \
342 				                     rho11_i, \
343 				                     beta11c_r, \
344 				                     beta11c_i ); \
345 \
346 				/* beta11 = beta11 / alpha11; */ \
347 				/* NOTE: When preinversion is enabled, the INVERSE of alpha11
348 				   (1.0/alpha11) is stored during packing instead alpha11 so we
349 				   can multiply rather than divide. When preinversion is disabled,
350 				   alpha11 is stored and division happens below explicitly. */ \
351 				PASTEMAC(ch,diagop)( *alpha11_r, \
352 				                     *alpha11_i, \
353 				                     beta11c_r, \
354 				                     beta11c_i ); \
355 \
356 				/* Output final result to matrix c. */ \
357 				PASTEMAC(ch,sets)(  beta11c_r, beta11c_i, *gamma11 ); \
358 \
359 				/* Store the local values back to b11. */ \
360 				PASTEMAC(ch,sets)(  beta11c_r, beta11c_i, *beta11_ri ); \
361 				PASTEMAC(ch,sets)( -beta11c_i, beta11c_r, *beta11_ir ); \
362 			} \
363 		} \
364 	} \
365 	else /* if ( bli_is_1r_packed( schema_b ) ) */ \
366 	{ \
367 		const inc_t       rs_b2 = 2 * rs_b; \
368 		const inc_t       cs_b2 = 1 * cs_b; \
369 \
370 		ctype*   restrict a_ri  = ( ctype*   )a; \
371 		/*ctype*   restrict a_ir  = ( ctype*   )a + ld_a/2;*/ \
372 \
373 		ctype_r* restrict b_r   = ( ctype_r* )b; \
374 		ctype_r* restrict b_i   = ( ctype_r* )b + ld_b; \
375 \
376 		for ( iter = 0; iter < m; ++iter ) \
377 		{ \
378 			i         = m - iter - 1; \
379 			n_behind  = iter; \
380 \
381 			ctype*   restrict alpha11_ri = a_ri + (i  )*rs_a  + (i  )*cs_a; \
382 			ctype_r* restrict alpha11_r  = &PASTEMAC(ch,real)( *alpha11_ri ); \
383 			ctype_r* restrict alpha11_i  = &PASTEMAC(ch,imag)( *alpha11_ri ); \
384 			ctype*   restrict a12t_ri    = a_ri + (i  )*rs_a  + (i+1)*cs_a; \
385 			ctype_r* restrict b1_r       = b_r  + (i  )*rs_b2 + (0  )*cs_b2; \
386 			ctype_r* restrict b1_i       = b_i  + (i  )*rs_b2 + (0  )*cs_b2; \
387 			ctype_r* restrict B2_r       = b_r  + (i+1)*rs_b2 + (0  )*cs_b2; \
388 			ctype_r* restrict B2_i       = b_i  + (i+1)*rs_b2 + (0  )*cs_b2; \
389 \
390 			/* b1 = b1 - a12t * B2; */ \
391 			/* b1 = b1 / alpha11; */ \
392 			for ( j = 0; j < n; ++j ) \
393 			{ \
394 				ctype_r* restrict beta11_r  = b1_r + (0  )*rs_b2 + (j  )*cs_b2; \
395 				ctype_r* restrict beta11_i  = b1_i + (0  )*rs_b2 + (j  )*cs_b2; \
396 				ctype_r* restrict b21_r     = B2_r + (0  )*rs_b2 + (j  )*cs_b2; \
397 				ctype_r* restrict b21_i     = B2_i + (0  )*rs_b2 + (j  )*cs_b2; \
398 				ctype*   restrict gamma11   = c    + (i  )*rs_c  + (j  )*cs_c; \
399 				ctype_r           beta11c_r = *beta11_r; \
400 				ctype_r           beta11c_i = *beta11_i; \
401 				ctype_r           rho11_r; \
402 				ctype_r           rho11_i; \
403 \
404 				/* beta11 = beta11 - a10t * b01; */ \
405 				PASTEMAC(ch,set0ris)( rho11_r, \
406 				                      rho11_i ); \
407 				for ( l = 0; l < n_behind; ++l ) \
408 				{ \
409 					ctype*   restrict alpha12_ri = a12t_ri + (l  )*cs_a; \
410 					ctype_r* restrict alpha12_r  = &PASTEMAC(ch,real)( *alpha12_ri ); \
411 					ctype_r* restrict alpha12_i  = &PASTEMAC(ch,imag)( *alpha12_ri ); \
412 					ctype_r* restrict beta21_r   = b21_r   + (l  )*rs_b2; \
413 					ctype_r* restrict beta21_i   = b21_i   + (l  )*rs_b2; \
414 \
415 					PASTEMAC(ch,axpyris)( *alpha12_r, \
416 					                      *alpha12_i, \
417 					                      *beta21_r, \
418 					                      *beta21_i, \
419 					                      rho11_r, \
420 					                      rho11_i ); \
421 				} \
422 				PASTEMAC(ch,subris)( rho11_r, \
423 				                     rho11_i, \
424 				                     beta11c_r, \
425 				                     beta11c_i ); \
426 \
427 				/* beta11 = beta11 / alpha11; */ \
428 				/* NOTE: When preinversion is enabled, the INVERSE of alpha11
429 				   (1.0/alpha11) is stored during packing instead alpha11 so we
430 				   can multiply rather than divide. When preinversion is disabled,
431 				   alpha11 is stored and division happens below explicitly. */ \
432 				PASTEMAC(ch,diagop)( *alpha11_r, \
433 				                     *alpha11_i, \
434 				                     beta11c_r, \
435 				                     beta11c_i ); \
436 \
437 				/* Output final result to matrix c. */ \
438 				PASTEMAC(ch,sets)( beta11c_r, \
439 				                   beta11c_i, *gamma11 ); \
440 \
441 				/* Store the local values back to b11. */ \
442 				PASTEMAC(ch,copyris)( beta11c_r, \
443 				                      beta11c_i, \
444 				                      *beta11_r, \
445 				                      *beta11_i ); \
446 			} \
447 		} \
448 	} \
449 }
450 
451 #ifdef BLIS_ENABLE_TRSM_PREINVERSION
452 INSERT_GENTFUNCCO_BASIC3( trsm1m_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scalris )
453 #else
454 INSERT_GENTFUNCCO_BASIC3( trsm1m_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscalris )
455 #endif
456