1 /*
2
3 BLIS
4 An object-based framework for developing high-performance BLAS-like
5 libraries.
6
7 Copyright (C) 2014, The University of Texas at Austin
8
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are
11 met:
12 - Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 - Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17 - Neither the name(s) of the copyright holder(s) nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
33 */
34
35 #include "blis.h"
36
37 #ifdef BLIS_ENABLE_GEMM_MD
38
39 #define FUNCPTR_T packm_fp
40
41 typedef void (*FUNCPTR_T)(
42 trans_t transc,
43 pack_t schema,
44 dim_t m,
45 dim_t n,
46 dim_t m_max,
47 dim_t n_max,
48 void* kappa,
49 void* c, inc_t rs_c, inc_t cs_c,
50 void* p, inc_t rs_p, inc_t cs_p,
51 inc_t is_p,
52 dim_t pd_p, inc_t ps_p,
53 cntx_t* cntx,
54 thrinfo_t* thread
55 );
56
57 static FUNCPTR_T GENARRAY2_ALL(ftypes,packm_blk_var1_md);
58
59
//
// bli_packm_blk_var1_md()
//
// Object-based front end for mixed-datatype packing. Queries the datatypes,
// dimensions, strides, and packing metadata of the source object c and the
// destination (packed) object p, chooses the scalar kappa to be applied
// during packing, and then dispatches to the type-specific implementation
// selected by the pair (datatype of c, datatype of p).
//
// Parameters:
//   c     - object to pack from; its buffer, row/column strides, and
//           conjugation/transposition status are read.
//   p     - object to pack into; its dimensions, padded dimensions, strides,
//           pack schema, panel dimension, and panel stride are read. If the
//           schema is not a native one and p's attached scalar has a nonzero
//           imaginary component, that scalar is detached from p and p's
//           attached scalar is reset.
//   cntx  - context, forwarded unchanged to the type-specific function.
//   cntl  - control tree node; not referenced by this function.
//   t     - thrinfo_t node for the calling thread, forwarded unchanged.
//
void bli_packm_blk_var1_md
     (
       obj_t*     c,
       obj_t*     p,
       cntx_t*    cntx,
       cntl_t*    cntl,
       thrinfo_t* t
     )
{
	num_t   dt_c    = bli_obj_dt( c );
	num_t   dt_p    = bli_obj_dt( p );

	trans_t transc  = bli_obj_conjtrans_status( c );
	pack_t  schema  = bli_obj_pack_schema( p );

	dim_t   m_p     = bli_obj_length( p );
	dim_t   n_p     = bli_obj_width( p );
	dim_t   m_max_p = bli_obj_padded_length( p );
	dim_t   n_max_p = bli_obj_padded_width( p );

	void*   buf_c   = bli_obj_buffer_at_off( c );
	inc_t   rs_c    = bli_obj_row_stride( c );
	inc_t   cs_c    = bli_obj_col_stride( c );

	void*   buf_p   = bli_obj_buffer_at_off( p );
	inc_t   rs_p    = bli_obj_row_stride( p );
	inc_t   cs_p    = bli_obj_col_stride( p );
	inc_t   is_p    = bli_obj_imag_stride( p );
	dim_t   pd_p    = bli_obj_panel_dim( p );
	inc_t   ps_p    = bli_obj_panel_stride( p );

	// NOTE: kappa is deliberately declared at function scope rather than
	// inside the else-branch below. buf_kappa is obtained from it via
	// bli_obj_buffer_for_1x1() and is used in the call to f() at the end of
	// the function, so kappa presumably needs to remain alive until then.
	obj_t   kappa;
	void*   buf_kappa;

	FUNCPTR_T f;


	// How kappa is chosen (i.e., whether scaling is performed during packing)
	// depends on whether we are executing an induced method.
	if ( bli_is_nat_packed( schema ) )
	{
		// This branch is for native execution, where we assume that
		// the micro-kernel will always apply the alpha scalar of the
		// higher-level operation. Thus, we use BLIS_ONE for kappa so
		// that the underlying packm implementation does not perform
		// any scaling during packing.
		buf_kappa = bli_obj_buffer_for_const( dt_p, &BLIS_ONE );
	}
	else // if ( bli_is_ind_packed( schema ) )
	{
		obj_t* kappa_p;

		// The value for kappa we use will depend on whether the scalar
		// attached to p has a nonzero imaginary component. If it does,
		// then we will apply the scalar during packing to facilitate
		// implementing induced complex domain algorithms in terms of
		// real domain micro-kernels. (In the aforementioned situation,
		// applying a real scalar is easy, but applying a complex one is
		// harder, so we avoid the need altogether with the code below.)
		if ( bli_obj_scalar_has_nonzero_imag( p ) )
		{
			// Detach the scalar.
			bli_obj_scalar_detach( p, &kappa );

			// Reset the attached scalar (to 1.0).
			bli_obj_scalar_reset( p );

			// Use the detached scalar as kappa.
			kappa_p = &kappa;
		}
		else
		{
			// If the attached scalar has only a real component, then
			// we will apply it later (in the micro-kernel), and so we will
			// use BLIS_ONE to indicate no scaling during packing.
			kappa_p = &BLIS_ONE;
		}

		// Acquire the buffer to the kappa chosen above.
		buf_kappa = bli_obj_buffer_for_1x1( dt_p, kappa_p );
	}


	// Index into the type combination array to extract the correct
	// function pointer.
	f = ftypes[dt_c][dt_p];

	// Invoke the function.
	f
	(
	  transc,
	  schema,
	  m_p,
	  n_p,
	  m_max_p,
	  n_max_p,
	  buf_kappa,
	  buf_c, rs_c, cs_c,
	  buf_p, rs_p, cs_p,
	         is_p,
	         pd_p, ps_p,
	  cntx,
	  t
	);
}
162
163
164 #undef GENTFUNC2
165 #define GENTFUNC2( ctype_c, ctype_p, chc, chp, varname ) \
166 \
167 void PASTEMAC2(chc,chp,varname) \
168 ( \
169 trans_t transc, \
170 pack_t schema, \
171 dim_t m, \
172 dim_t n, \
173 dim_t m_max, \
174 dim_t n_max, \
175 void* kappa, \
176 void* c, inc_t rs_c, inc_t cs_c, \
177 void* p, inc_t rs_p, inc_t cs_p, \
178 inc_t is_p, \
179 dim_t pd_p, inc_t ps_p, \
180 cntx_t* cntx, \
181 thrinfo_t* thread \
182 ) \
183 { \
184 ctype_p* restrict kappa_cast = kappa; \
185 ctype_c* restrict c_cast = c; \
186 ctype_p* restrict p_cast = p; \
187 ctype_c* restrict c_begin; \
188 ctype_p* restrict p_begin; \
189 \
190 dim_t iter_dim; \
191 dim_t n_iter; \
192 dim_t it, ic, ip; \
193 doff_t ic_inc, ip_inc; \
194 dim_t panel_len_full; \
195 dim_t panel_len_i; \
196 dim_t panel_len_max; \
197 dim_t panel_len_max_i; \
198 dim_t panel_dim_i; \
199 dim_t panel_dim_max; \
200 inc_t vs_c; \
201 inc_t p_inc; \
202 dim_t* m_panel_use; \
203 dim_t* n_panel_use; \
204 dim_t* m_panel_max; \
205 dim_t* n_panel_max; \
206 conj_t conjc; \
207 bool row_stored; \
208 bool col_stored; \
209 \
210 ctype_c* restrict c_use; \
211 ctype_p* restrict p_use; \
212 \
213 \
214 /* Extract the conjugation bit from the transposition argument. */ \
215 conjc = bli_extract_conj( transc ); \
216 \
217 /* If c needs a transposition, induce it so that we can more simply
218 express the remaining parameters and code. */ \
219 if ( bli_does_trans( transc ) ) \
220 { \
221 bli_swap_incs( &rs_c, &cs_c ); \
222 bli_toggle_trans( &transc ); \
223 } \
224 \
225 /* Create flags to incidate row or column storage. Note that the
226 schema bit that encodes row or column is describing the form of
227 micro-panel, not the storage in the micro-panel. Hence the
228 mismatch in "row" and "column" semantics. */ \
229 row_stored = bli_is_col_packed( schema ); \
230 col_stored = bli_is_row_packed( schema ); \
231 \
232 ( void )col_stored; \
233 \
234 /* If the row storage flag indicates row storage, then we are packing
235 to column panels; otherwise, if the strides indicate column storage,
236 we are packing to row panels. */ \
237 if ( row_stored ) \
238 { \
239 /* Prepare to pack to row-stored column panels. */ \
240 iter_dim = n; \
241 panel_len_full = m; \
242 panel_len_max = m_max; \
243 panel_dim_max = pd_p; \
244 vs_c = cs_c; \
245 m_panel_use = &panel_len_i; \
246 n_panel_use = &panel_dim_i; \
247 m_panel_max = &panel_len_max_i; \
248 n_panel_max = &panel_dim_max; \
249 } \
250 else /* if ( col_stored ) */ \
251 { \
252 /* Prepare to pack to column-stored row panels. */ \
253 iter_dim = m; \
254 panel_len_full = n; \
255 panel_len_max = n_max; \
256 panel_dim_max = pd_p; \
257 vs_c = rs_c; \
258 m_panel_use = &panel_dim_i; \
259 n_panel_use = &panel_len_i; \
260 m_panel_max = &panel_dim_max; \
261 n_panel_max = &panel_len_max_i; \
262 } \
263 \
264 /* Compute the total number of iterations we'll need. */ \
265 n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \
266 \
267 { \
268 ic_inc = panel_dim_max; \
269 ip_inc = 1; \
270 } \
271 \
272 p_begin = p_cast; \
273 \
274 /* Query the number of threads and thread ids from the current thread's
275 packm thrinfo_t node. */ \
276 const dim_t nt = bli_thread_n_way( thread ); \
277 const dim_t tid = bli_thread_work_id( thread ); \
278 \
279 /* Suppress unused variable warnings when slab partitioning is enabled,
280 since the slab-based definition of bli_packm_my_iter() does not
281 actually use tid or nt. */ \
282 ( void )nt; ( void )tid; \
283 \
284 dim_t it_start, it_end, it_inc; \
285 \
286 /* Determine the thread range and increment using the current thread's
287 packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
288 will depend on whether slab or round-robin partitioning was requested
289 at configure-time. */ \
290 bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \
291 \
292 for ( ic = 0, ip = 0, it = 0; it < n_iter; \
293 ic += ic_inc, ip += ip_inc, it += 1 ) \
294 { \
295 panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \
296 \
297 c_begin = c_cast + (ic )*vs_c; \
298 \
299 { \
300 c_use = c_begin; \
301 p_use = p_begin; \
302 \
303 panel_len_i = panel_len_full; \
304 panel_len_max_i = panel_len_max; \
305 \
306 if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \
307 { \
308 PASTEMAC2(chc,chp,packm_struc_cxk_md) \
309 ( \
310 conjc, \
311 schema, \
312 *m_panel_use, \
313 *n_panel_use, \
314 *m_panel_max, \
315 *n_panel_max, \
316 kappa_cast, \
317 c_use, rs_c, cs_c, \
318 p_use, rs_p, cs_p, \
319 is_p, \
320 cntx \
321 ); \
322 } \
323 \
324 p_inc = ps_p; \
325 } \
326 \
327 /*
328 if ( row_stored ) \
329 PASTEMAC(chp,fprintm)( stdout, "packm_blk_var1_md: b packed", *m_panel_max, *n_panel_max, \
330 p_use, rs_p, cs_p, "%5.2f", "" ); \
331 else \
332 PASTEMAC(chp,fprintm)( stdout, "packm_blk_var1_md: a packed", *m_panel_max, *n_panel_max, \
333 p_use, rs_p, cs_p, "%5.2f", "" ); \
334 */ \
335 \
336 p_begin += p_inc; \
337 \
338 } \
339 }
340
341 INSERT_GENTFUNC2_BASIC0( packm_blk_var1_md )
342 INSERT_GENTFUNC2_MIXDP0( packm_blk_var1_md )
343
344 #endif
345