1 /*
2 
3    BLIS
4    An object-based framework for developing high-performance BLAS-like
5    libraries.
6 
7    Copyright (C) 2014, The University of Texas at Austin
8 
9    Redistribution and use in source and binary forms, with or without
10    modification, are permitted provided that the following conditions are
11    met:
12     - Redistributions of source code must retain the above copyright
13       notice, this list of conditions and the following disclaimer.
14     - Redistributions in binary form must reproduce the above copyright
15       notice, this list of conditions and the following disclaimer in the
16       documentation and/or other materials provided with the distribution.
17     - Neither the name(s) of the copyright holder(s) nor the names of its
18       contributors may be used to endorse or promote products derived
19       from this software without specific prior written permission.
20 
21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 */
34 
35 #include "blis.h"
36 
37 #define FUNCPTR_T packm_fp
38 
39 typedef void (*FUNCPTR_T)(
40                            struc_t strucc,
41                            doff_t  diagoffc,
42                            diag_t  diagc,
43                            uplo_t  uploc,
44                            trans_t transc,
45                            dim_t   m,
46                            dim_t   n,
47                            dim_t   m_max,
48                            dim_t   n_max,
49                            void*   kappa,
50                            void*   c, inc_t rs_c, inc_t cs_c,
51                            void*   p, inc_t rs_p, inc_t cs_p,
52                            cntx_t* cntx
53                          );
54 
55 static FUNCPTR_T GENARRAY(ftypes,packm_unb_var1);
56 
57 
bli_packm_unb_var1(obj_t * c,obj_t * p,cntx_t * cntx,cntl_t * cntl,thrinfo_t * thread)58 void bli_packm_unb_var1
59      (
60        obj_t*  c,
61        obj_t*  p,
62        cntx_t* cntx,
63        cntl_t* cntl,
64        thrinfo_t* thread
65      )
66 {
67 	num_t     dt_cp     = bli_obj_dt( c );
68 
69 	struc_t   strucc    = bli_obj_struc( c );
70 	doff_t    diagoffc  = bli_obj_diag_offset( c );
71 	diag_t    diagc     = bli_obj_diag( c );
72 	uplo_t    uploc     = bli_obj_uplo( c );
73 	trans_t   transc    = bli_obj_conjtrans_status( c );
74 
75 	dim_t     m_p       = bli_obj_length( p );
76 	dim_t     n_p       = bli_obj_width( p );
77 	dim_t     m_max_p   = bli_obj_padded_length( p );
78 	dim_t     n_max_p   = bli_obj_padded_width( p );
79 
80 	void*     buf_c     = bli_obj_buffer_at_off( c );
81 	inc_t     rs_c      = bli_obj_row_stride( c );
82 	inc_t     cs_c      = bli_obj_col_stride( c );
83 
84 	void*     buf_p     = bli_obj_buffer_at_off( p );
85 	inc_t     rs_p      = bli_obj_row_stride( p );
86 	inc_t     cs_p      = bli_obj_col_stride( p );
87 
88 	void*     buf_kappa;
89 
90 	FUNCPTR_T f;
91 
92 
93 	// This variant assumes that the computational kernel will always apply
94 	// the alpha scalar of the higher-level operation. Thus, we use BLIS_ONE
95 	// for kappa so that the underlying packm implementation does not scale
96 	// during packing.
97 	buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE );
98 
99 	// Index into the type combination array to extract the correct
100 	// function pointer.
101 	f = ftypes[dt_cp];
102 
103     if( bli_thread_am_ochief( thread ) ) {
104         // Invoke the function.
105         f
106 		(
107 		  strucc,
108           diagoffc,
109           diagc,
110           uploc,
111           transc,
112           m_p,
113           n_p,
114           m_max_p,
115           n_max_p,
116           buf_kappa,
117           buf_c, rs_c, cs_c,
118           buf_p, rs_p, cs_p,
119 		  cntx
120 		);
121     }
122 }
123 
124 
125 #undef  GENTFUNC
126 #define GENTFUNC( ctype, ch, varname ) \
127 \
128 void PASTEMAC(ch,varname) \
129      ( \
130        struc_t strucc, \
131        doff_t  diagoffc, \
132        diag_t  diagc, \
133        uplo_t  uploc, \
134        trans_t transc, \
135        dim_t   m, \
136        dim_t   n, \
137        dim_t   m_max, \
138        dim_t   n_max, \
139        void*   kappa, \
140        void*   c, inc_t rs_c, inc_t cs_c, \
141        void*   p, inc_t rs_p, inc_t cs_p, \
142        cntx_t* cntx  \
143      ) \
144 { \
145 	ctype* restrict kappa_cast = kappa; \
146 	ctype* restrict c_cast     = c; \
147 	ctype* restrict p_cast     = p; \
148 	ctype* restrict zero       = PASTEMAC(ch,0); \
149 \
150 	/* We begin by packing the region indicated by the parameters. If
151 	   matrix c is dense (either because the structure is general or
152 	   because the structure has already been "densified"), this ends
153 	   up being the only action we take. Note that if kappa is unit,
154 	   the data is simply copied (rather than scaled by one). */ \
155 	PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
156 	( \
157 	  diagoffc, \
158 	  diagc, \
159 	  uploc, \
160 	  transc, \
161 	  m, \
162 	  n, \
163 	  kappa_cast, \
164 	  c_cast, rs_c, cs_c, \
165 	  p_cast, rs_p, cs_p, \
166 	  cntx, \
167 	  NULL  \
168 	); \
169 \
170 	/* If uploc is upper or lower, then the structure of c is necessarily
171 	   non-dense (ie: Hermitian, symmetric, or triangular, where part of the
172 	   matrix is unstored). In these cases, we want to fill in the unstored
173 	   part of the matrix. How this is done depends on the structure of c. */ \
174 	if ( bli_is_upper_or_lower( uploc ) ) \
175 	{ \
176 		/* The Hermitian and symmetric cases are almost identical, so we
177 		   handle them in one conditional block. */ \
178 		if ( bli_is_hermitian( strucc ) || bli_is_symmetric( strucc ) ) \
179 		{ \
180 			/* First we must reflect the region referenced to the opposite
181 			   side of the diagonal. */ \
182 			c_cast = c_cast + diagoffc * ( doff_t )cs_c + \
183 			                 -diagoffc * ( doff_t )rs_c; \
184 			bli_negate_diag_offset( &diagoffc ); \
185 			bli_toggle_trans( &transc ); \
186 			if      ( bli_is_upper( uploc ) ) diagoffc += 1; \
187 			else if ( bli_is_lower( uploc ) ) diagoffc -= 1; \
188 \
189 			/* If c is Hermitian, we need to apply a conjugation when
190 			   copying the region opposite the diagonal. */ \
191 			if ( bli_is_hermitian( strucc ) ) \
192 				transc = bli_trans_toggled_conj( transc ); \
193 \
194 			/* Copy the data from the region opposite the diagonal of c
195 			   (as specified by the original value of diagoffc). Notice
196 			   that we use a diag parameter of non-unit since we can
197 			   assume nothing about the neighboring off-diagonal. */ \
198 			PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
199 			( \
200 			  diagoffc, \
201 			  BLIS_NONUNIT_DIAG, \
202 			  uploc, \
203 			  transc, \
204 			  m, \
205 			  n, \
206 			  kappa_cast, \
207 			  c_cast, rs_c, cs_c, \
208 			  p_cast, rs_p, cs_p, \
209 			  cntx, \
210 			  NULL  \
211 			); \
212 		} \
213 		else /* if ( bli_is_triangular( strucc ) ) */ \
214 		{ \
215 			doff_t diagoffp = diagoffc; \
216 			uplo_t uplop    = uploc; \
217 \
218 			/* For this step we need the uplo and diagonal offset of p, which
219 			   we can derive from the parameters given. */ \
220 			if ( bli_does_trans( transc ) ) \
221 			{ \
222 				bli_negate_diag_offset( &diagoffp ); \
223 				bli_toggle_uplo( &uplop ); \
224 			} \
225 \
226 			/* For triangular matrices, we wish to reference the region
227 			   strictly opposite the diagonal of C. This amounts to
228 			   toggling uploc and then shifting the diagonal offset to
229 			   shrink the stored region (by one diagonal). */ \
230 			bli_toggle_uplo( &uplop ); \
231 			bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffp ); \
232 \
233 			/* Set the region opposite the diagonal of p to zero. */ \
234 			PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
235 			( \
236 			  BLIS_NO_CONJUGATE, \
237 			  diagoffp, \
238 			  BLIS_NONUNIT_DIAG, \
239 			  uplop, \
240 			  m, \
241 			  n, \
242 			  zero, \
243 			  p_cast, rs_p, cs_p, \
244 			  cntx, \
245 			  NULL  \
246 			); \
247 		} \
248 	} \
249 \
250 	/* The packed memory region was acquired/allocated with "aligned"
251 	   dimensions (ie: dimensions that were possibly inflated up to a
252 	   multiple). When these dimension are inflated, it creates empty
253 	   regions along the bottom and/or right edges of the matrix. If
254 	   eithe region exists, we set them to zero. This simplifies the
255 	   register level micro kernel in that it does not need to support
256 	   different register blockings for the edge cases. */ \
257 	if ( m != m_max ) \
258 	{ \
259 		ctype* p_edge = p_cast + (m  )*rs_p; \
260 \
261 		PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
262 		( \
263 		  BLIS_NO_CONJUGATE, \
264 		  0, \
265 		  BLIS_NONUNIT_DIAG, \
266 		  BLIS_DENSE, \
267 		  m_max - m, \
268 		  n_max, \
269 		  zero, \
270 		  p_edge, rs_p, cs_p, \
271 		  cntx, \
272 		  NULL  \
273 		); \
274 	} \
275 \
276 	if ( n != n_max ) \
277 	{ \
278 		ctype* p_edge = p_cast + (n  )*cs_p; \
279 \
280 		PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
281 		( \
282 		  BLIS_NO_CONJUGATE, \
283 		  0, \
284 		  BLIS_NONUNIT_DIAG, \
285 		  BLIS_DENSE, \
286 		  m_max, \
287 		  n_max - n, \
288 		  zero, \
289 		  p_edge, rs_p, cs_p, \
290 		  cntx, \
291 		  NULL  \
292 		); \
293 	} \
294 }
295 
296 INSERT_GENTFUNC_BASIC0( packm_unb_var1 )
297 
298