1 /*
2 
3    BLIS
4    An object-based framework for developing high-performance BLAS-like
5    libraries.
6 
7    Copyright (C) 2014, The University of Texas at Austin
8 
9    Redistribution and use in source and binary forms, with or without
10    modification, are permitted provided that the following conditions are
11    met:
12     - Redistributions of source code must retain the above copyright
13       notice, this list of conditions and the following disclaimer.
14     - Redistributions in binary form must reproduce the above copyright
15       notice, this list of conditions and the following disclaimer in the
16       documentation and/or other materials provided with the distribution.
17     - Neither the name(s) of the copyright holder(s) nor the names of its
18       contributors may be used to endorse or promote products derived
19       from this software without specific prior written permission.
20 
21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 */
34 
35 #include "blis.h"
36 
37 #ifdef BLIS_ENABLE_GEMM_MD
38 
39 #define FUNCPTR_T packm_fp
40 
41 typedef void (*FUNCPTR_T)(
42                            trans_t transc,
43                            pack_t  schema,
44                            dim_t   m,
45                            dim_t   n,
46                            dim_t   m_max,
47                            dim_t   n_max,
48                            void*   kappa,
49                            void*   c, inc_t rs_c, inc_t cs_c,
50                            void*   p, inc_t rs_p, inc_t cs_p,
51                                       inc_t is_p,
52                                       dim_t pd_p, inc_t ps_p,
53                            cntx_t* cntx,
54                            thrinfo_t* thread
55                          );
56 
57 static FUNCPTR_T GENARRAY2_ALL(ftypes,packm_blk_var1_md);
58 
59 
/*
   bli_packm_blk_var1_md()

   Object-based front-end for the mixed-datatype packm variant. It queries
   the datatypes, dimensions, strides, and packing metadata of the source
   object c and the packed object p, selects the scalar (kappa) to apply
   while packing, and then calls the typed implementation stored at
   ftypes[dt_c][dt_p].

   Parameters:
     c    - source object whose elements are packed.
     p    - destination (packed) object. When p uses an induced-method
            schema and its attached scalar has a nonzero imaginary
            component, that scalar is detached and p's attached scalar is
            reset by this function.
     cntx - context; forwarded unchanged to the typed implementation.
     cntl - control tree node; not referenced by this function.
     t    - thrinfo_t node; forwarded unchanged to the typed implementation.
*/
void bli_packm_blk_var1_md
     (
       obj_t*   c,
       obj_t*   p,
       cntx_t*  cntx,
       cntl_t*  cntl,
       thrinfo_t* t
     )
{
	num_t     dt_c       = bli_obj_dt( c );
	num_t     dt_p       = bli_obj_dt( p );

	trans_t   transc     = bli_obj_conjtrans_status( c );
	pack_t    schema     = bli_obj_pack_schema( p );

	dim_t     m_p        = bli_obj_length( p );
	dim_t     n_p        = bli_obj_width( p );
	dim_t     m_max_p    = bli_obj_padded_length( p );
	dim_t     n_max_p    = bli_obj_padded_width( p );

	void*     buf_c      = bli_obj_buffer_at_off( c );
	inc_t     rs_c       = bli_obj_row_stride( c );
	inc_t     cs_c       = bli_obj_col_stride( c );

	void*     buf_p      = bli_obj_buffer_at_off( p );
	inc_t     rs_p       = bli_obj_row_stride( p );
	inc_t     cs_p       = bli_obj_col_stride( p );
	inc_t     is_p       = bli_obj_imag_stride( p );
	dim_t     pd_p       = bli_obj_panel_dim( p );
	inc_t     ps_p       = bli_obj_panel_stride( p );

	// Storage for a scalar detached from p. It is only written (and only
	// read, via buf_kappa) in the induced-method branch below.
	obj_t     kappa;
	void*     buf_kappa;

	FUNCPTR_T f;


	// Treatment of kappa (i.e., scaling during packing) depends on
	// whether we are executing an induced method.
	if ( bli_is_nat_packed( schema ) )
	{
		// This branch is for native execution, where we assume that
		// the micro-kernel will always apply the alpha scalar of the
		// higher-level operation. Thus, we use BLIS_ONE for kappa so
		// that the underlying packm implementation does not perform
		// any scaling during packing.
		buf_kappa = bli_obj_buffer_for_const( dt_p, &BLIS_ONE );
	}
	else // if ( bli_is_ind_packed( schema ) )
	{
		obj_t* kappa_p;

		// The value for kappa we use will depend on whether the scalar
		// attached to p has a nonzero imaginary component. If it does,
		// then we will apply the scalar during packing to facilitate
		// implementing induced complex domain algorithms in terms of
		// real domain micro-kernels. (In the aforementioned situation,
		// applying a real scalar is easy, but applying a complex one is
		// harder, so we avoid the need altogether with the code below.)
		if ( bli_obj_scalar_has_nonzero_imag( p ) )
		{
			// Detach the scalar.
			bli_obj_scalar_detach( p, &kappa );

			// Reset the attached scalar (to 1.0).
			bli_obj_scalar_reset( p );

			// Scale by the detached scalar while packing.
			kappa_p = &kappa;
		}
		else
		{
			// If the scalar attached to p has only a real component, then
			// we will apply it later (in the micro-kernel), and so we will
			// use BLIS_ONE to indicate no scaling during packing.
			kappa_p = &BLIS_ONE;
		}

		// Acquire the buffer to the kappa chosen above.
		buf_kappa = bli_obj_buffer_for_1x1( dt_p, kappa_p );
	}


	// Index into the type combination array to extract the correct
	// function pointer.
	f = ftypes[dt_c][dt_p];

	// Invoke the function.
	f(
	   transc,
	   schema,
	   m_p,
	   n_p,
	   m_max_p,
	   n_max_p,
	   buf_kappa,
	   buf_c, rs_c, cs_c,
	   buf_p, rs_p, cs_p,
	          is_p,
	          pd_p, ps_p,
	   cntx,
	   t );
}
162 
163 
164 #undef  GENTFUNC2
165 #define GENTFUNC2( ctype_c, ctype_p, chc, chp, varname ) \
166 \
167 void PASTEMAC2(chc,chp,varname) \
168      ( \
169        trans_t transc, \
170        pack_t  schema, \
171        dim_t   m, \
172        dim_t   n, \
173        dim_t   m_max, \
174        dim_t   n_max, \
175        void*   kappa, \
176        void*   c, inc_t rs_c, inc_t cs_c, \
177        void*   p, inc_t rs_p, inc_t cs_p, \
178                   inc_t is_p, \
179                   dim_t pd_p, inc_t ps_p, \
180        cntx_t* cntx, \
181        thrinfo_t* thread  \
182      ) \
183 { \
184 	ctype_p* restrict kappa_cast = kappa; \
185 	ctype_c* restrict c_cast     = c; \
186 	ctype_p* restrict p_cast     = p; \
187 	ctype_c* restrict c_begin; \
188 	ctype_p* restrict p_begin; \
189 \
190 	dim_t             iter_dim; \
191 	dim_t             n_iter; \
192 	dim_t             it, ic, ip; \
193 	doff_t            ic_inc, ip_inc; \
194 	dim_t             panel_len_full; \
195 	dim_t             panel_len_i; \
196 	dim_t             panel_len_max; \
197 	dim_t             panel_len_max_i; \
198 	dim_t             panel_dim_i; \
199 	dim_t             panel_dim_max; \
200 	inc_t             vs_c; \
201 	inc_t             p_inc; \
202 	dim_t*            m_panel_use; \
203 	dim_t*            n_panel_use; \
204 	dim_t*            m_panel_max; \
205 	dim_t*            n_panel_max; \
206 	conj_t            conjc; \
207 	bool              row_stored; \
208 	bool              col_stored; \
209 \
210 	ctype_c* restrict c_use; \
211 	ctype_p* restrict p_use; \
212 \
213 \
214 	/* Extract the conjugation bit from the transposition argument. */ \
215 	conjc = bli_extract_conj( transc ); \
216 \
217 	/* If c needs a transposition, induce it so that we can more simply
218 	   express the remaining parameters and code. */ \
219 	if ( bli_does_trans( transc ) ) \
220 	{ \
221 		bli_swap_incs( &rs_c, &cs_c ); \
222 		bli_toggle_trans( &transc ); \
223 	} \
224 \
225 	/* Create flags to incidate row or column storage. Note that the
226 	   schema bit that encodes row or column is describing the form of
227 	   micro-panel, not the storage in the micro-panel. Hence the
228 	   mismatch in "row" and "column" semantics. */ \
229 	row_stored = bli_is_col_packed( schema ); \
230 	col_stored = bli_is_row_packed( schema ); \
231 \
232 	( void )col_stored; \
233 \
234 	/* If the row storage flag indicates row storage, then we are packing
235 	   to column panels; otherwise, if the strides indicate column storage,
236 	   we are packing to row panels. */ \
237 	if ( row_stored ) \
238 	{ \
239 		/* Prepare to pack to row-stored column panels. */ \
240 		iter_dim       = n; \
241 		panel_len_full = m; \
242 		panel_len_max  = m_max; \
243 		panel_dim_max  = pd_p; \
244 		vs_c           = cs_c; \
245 		m_panel_use    = &panel_len_i; \
246 		n_panel_use    = &panel_dim_i; \
247 		m_panel_max    = &panel_len_max_i; \
248 		n_panel_max    = &panel_dim_max; \
249 	} \
250 	else /* if ( col_stored ) */ \
251 	{ \
252 		/* Prepare to pack to column-stored row panels. */ \
253 		iter_dim       = m; \
254 		panel_len_full = n; \
255 		panel_len_max  = n_max; \
256 		panel_dim_max  = pd_p; \
257 		vs_c           = rs_c; \
258 		m_panel_use    = &panel_dim_i; \
259 		n_panel_use    = &panel_len_i; \
260 		m_panel_max    = &panel_dim_max; \
261 		n_panel_max    = &panel_len_max_i; \
262 	} \
263 \
264 	/* Compute the total number of iterations we'll need. */ \
265 	n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
266 \
267 	{ \
268 		ic_inc = panel_dim_max; \
269 		ip_inc = 1; \
270 	} \
271 \
272 	p_begin = p_cast; \
273 \
274 	/* Query the number of threads and thread ids from the current thread's
275 	   packm thrinfo_t node. */ \
276 	const dim_t nt  = bli_thread_n_way( thread ); \
277 	const dim_t tid = bli_thread_work_id( thread ); \
278 \
279 	/* Suppress unused variable warnings when slab partitioning is enabled,
280 	   since the slab-based definition of bli_packm_my_iter() does not
281 	   actually use tid or nt. */ \
282 	( void )nt; ( void )tid; \
283 \
284 	dim_t it_start, it_end, it_inc; \
285 \
286 	/* Determine the thread range and increment using the current thread's
287 	   packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
288 	   will depend on whether slab or round-robin partitioning was requested
289 	   at configure-time. */ \
290 	bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
291 \
292 	for ( ic  = 0,      ip  = 0,      it  = 0; it < n_iter; \
293 	      ic += ic_inc, ip += ip_inc, it += 1 ) \
294 	{ \
295 		panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
296 \
297 		c_begin     = c_cast + (ic  )*vs_c; \
298 \
299 		{ \
300 			c_use = c_begin; \
301 			p_use = p_begin; \
302 \
303 			panel_len_i     = panel_len_full; \
304 			panel_len_max_i = panel_len_max; \
305 \
306 			if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
307 			{ \
308 				PASTEMAC2(chc,chp,packm_struc_cxk_md) \
309 				( \
310 				  conjc, \
311 				  schema, \
312 				  *m_panel_use, \
313 				  *n_panel_use, \
314 				  *m_panel_max, \
315 				  *n_panel_max, \
316 				  kappa_cast, \
317 				  c_use, rs_c, cs_c, \
318 				  p_use, rs_p, cs_p, \
319 			             is_p, \
320 				  cntx \
321 				); \
322 			} \
323 \
324 			p_inc = ps_p; \
325 		} \
326 \
327 /*
328 if ( row_stored ) \
329 PASTEMAC(chp,fprintm)( stdout, "packm_blk_var1_md: b packed", *m_panel_max, *n_panel_max, \
330                                 p_use, rs_p, cs_p, "%5.2f", "" ); \
331 else \
332 PASTEMAC(chp,fprintm)( stdout, "packm_blk_var1_md: a packed", *m_panel_max, *n_panel_max, \
333                                 p_use, rs_p, cs_p, "%5.2f", "" ); \
334 */ \
335 \
336 		p_begin += p_inc; \
337 \
338 	} \
339 }
340 
341 INSERT_GENTFUNC2_BASIC0( packm_blk_var1_md )
342 INSERT_GENTFUNC2_MIXDP0( packm_blk_var1_md )
343 
344 #endif
345