1 /*
2
3 BLIS
4 An object-based framework for developing high-performance BLAS-like
5 libraries.
6
7 Copyright (C) 2014, The University of Texas at Austin
8
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are
11 met:
12 - Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 - Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17 - Neither the name(s) of the copyright holder(s) nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
33 */
34
35 #include "blis.h"
36
37 #define FUNCPTR_T packm_fp
38
39 typedef void (*FUNCPTR_T)(
40 struc_t strucc,
41 doff_t diagoffc,
42 diag_t diagc,
43 uplo_t uploc,
44 trans_t transc,
45 dim_t m,
46 dim_t n,
47 dim_t m_max,
48 dim_t n_max,
49 void* kappa,
50 void* c, inc_t rs_c, inc_t cs_c,
51 void* p, inc_t rs_p, inc_t cs_p,
52 cntx_t* cntx
53 );
54
55 static FUNCPTR_T GENARRAY(ftypes,packm_unb_var1);
56
57
bli_packm_unb_var1(obj_t * c,obj_t * p,cntx_t * cntx,cntl_t * cntl,thrinfo_t * thread)58 void bli_packm_unb_var1
59 (
60 obj_t* c,
61 obj_t* p,
62 cntx_t* cntx,
63 cntl_t* cntl,
64 thrinfo_t* thread
65 )
66 {
67 num_t dt_cp = bli_obj_dt( c );
68
69 struc_t strucc = bli_obj_struc( c );
70 doff_t diagoffc = bli_obj_diag_offset( c );
71 diag_t diagc = bli_obj_diag( c );
72 uplo_t uploc = bli_obj_uplo( c );
73 trans_t transc = bli_obj_conjtrans_status( c );
74
75 dim_t m_p = bli_obj_length( p );
76 dim_t n_p = bli_obj_width( p );
77 dim_t m_max_p = bli_obj_padded_length( p );
78 dim_t n_max_p = bli_obj_padded_width( p );
79
80 void* buf_c = bli_obj_buffer_at_off( c );
81 inc_t rs_c = bli_obj_row_stride( c );
82 inc_t cs_c = bli_obj_col_stride( c );
83
84 void* buf_p = bli_obj_buffer_at_off( p );
85 inc_t rs_p = bli_obj_row_stride( p );
86 inc_t cs_p = bli_obj_col_stride( p );
87
88 void* buf_kappa;
89
90 FUNCPTR_T f;
91
92
93 // This variant assumes that the computational kernel will always apply
94 // the alpha scalar of the higher-level operation. Thus, we use BLIS_ONE
95 // for kappa so that the underlying packm implementation does not scale
96 // during packing.
97 buf_kappa = bli_obj_buffer_for_const( dt_cp, &BLIS_ONE );
98
99 // Index into the type combination array to extract the correct
100 // function pointer.
101 f = ftypes[dt_cp];
102
103 if( bli_thread_am_ochief( thread ) ) {
104 // Invoke the function.
105 f
106 (
107 strucc,
108 diagoffc,
109 diagc,
110 uploc,
111 transc,
112 m_p,
113 n_p,
114 m_max_p,
115 n_max_p,
116 buf_kappa,
117 buf_c, rs_c, cs_c,
118 buf_p, rs_p, cs_p,
119 cntx
120 );
121 }
122 }
123
124
125 #undef GENTFUNC
126 #define GENTFUNC( ctype, ch, varname ) \
127 \
128 void PASTEMAC(ch,varname) \
129 ( \
130 struc_t strucc, \
131 doff_t diagoffc, \
132 diag_t diagc, \
133 uplo_t uploc, \
134 trans_t transc, \
135 dim_t m, \
136 dim_t n, \
137 dim_t m_max, \
138 dim_t n_max, \
139 void* kappa, \
140 void* c, inc_t rs_c, inc_t cs_c, \
141 void* p, inc_t rs_p, inc_t cs_p, \
142 cntx_t* cntx \
143 ) \
144 { \
145 ctype* restrict kappa_cast = kappa; \
146 ctype* restrict c_cast = c; \
147 ctype* restrict p_cast = p; \
148 ctype* restrict zero = PASTEMAC(ch,0); \
149 \
150 /* We begin by packing the region indicated by the parameters. If
151 matrix c is dense (either because the structure is general or
152 because the structure has already been "densified"), this ends
153 up being the only action we take. Note that if kappa is unit,
154 the data is simply copied (rather than scaled by one). */ \
155 PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
156 ( \
157 diagoffc, \
158 diagc, \
159 uploc, \
160 transc, \
161 m, \
162 n, \
163 kappa_cast, \
164 c_cast, rs_c, cs_c, \
165 p_cast, rs_p, cs_p, \
166 cntx, \
167 NULL \
168 ); \
169 \
170 /* If uploc is upper or lower, then the structure of c is necessarily
171 non-dense (ie: Hermitian, symmetric, or triangular, where part of the
172 matrix is unstored). In these cases, we want to fill in the unstored
173 part of the matrix. How this is done depends on the structure of c. */ \
174 if ( bli_is_upper_or_lower( uploc ) ) \
175 { \
176 /* The Hermitian and symmetric cases are almost identical, so we
177 handle them in one conditional block. */ \
178 if ( bli_is_hermitian( strucc ) || bli_is_symmetric( strucc ) ) \
179 { \
180 /* First we must reflect the region referenced to the opposite
181 side of the diagonal. */ \
182 c_cast = c_cast + diagoffc * ( doff_t )cs_c + \
183 -diagoffc * ( doff_t )rs_c; \
184 bli_negate_diag_offset( &diagoffc ); \
185 bli_toggle_trans( &transc ); \
186 if ( bli_is_upper( uploc ) ) diagoffc += 1; \
187 else if ( bli_is_lower( uploc ) ) diagoffc -= 1; \
188 \
189 /* If c is Hermitian, we need to apply a conjugation when
190 copying the region opposite the diagonal. */ \
191 if ( bli_is_hermitian( strucc ) ) \
192 transc = bli_trans_toggled_conj( transc ); \
193 \
194 /* Copy the data from the region opposite the diagonal of c
195 (as specified by the original value of diagoffc). Notice
196 that we use a diag parameter of non-unit since we can
197 assume nothing about the neighboring off-diagonal. */ \
198 PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \
199 ( \
200 diagoffc, \
201 BLIS_NONUNIT_DIAG, \
202 uploc, \
203 transc, \
204 m, \
205 n, \
206 kappa_cast, \
207 c_cast, rs_c, cs_c, \
208 p_cast, rs_p, cs_p, \
209 cntx, \
210 NULL \
211 ); \
212 } \
213 else /* if ( bli_is_triangular( strucc ) ) */ \
214 { \
215 doff_t diagoffp = diagoffc; \
216 uplo_t uplop = uploc; \
217 \
218 /* For this step we need the uplo and diagonal offset of p, which
219 we can derive from the parameters given. */ \
220 if ( bli_does_trans( transc ) ) \
221 { \
222 bli_negate_diag_offset( &diagoffp ); \
223 bli_toggle_uplo( &uplop ); \
224 } \
225 \
226 /* For triangular matrices, we wish to reference the region
227 strictly opposite the diagonal of C. This amounts to
228 toggling uploc and then shifting the diagonal offset to
229 shrink the stored region (by one diagonal). */ \
230 bli_toggle_uplo( &uplop ); \
231 bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffp ); \
232 \
233 /* Set the region opposite the diagonal of p to zero. */ \
234 PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
235 ( \
236 BLIS_NO_CONJUGATE, \
237 diagoffp, \
238 BLIS_NONUNIT_DIAG, \
239 uplop, \
240 m, \
241 n, \
242 zero, \
243 p_cast, rs_p, cs_p, \
244 cntx, \
245 NULL \
246 ); \
247 } \
248 } \
249 \
250 /* The packed memory region was acquired/allocated with "aligned"
251 dimensions (ie: dimensions that were possibly inflated up to a
252 multiple). When these dimension are inflated, it creates empty
253 regions along the bottom and/or right edges of the matrix. If
254 eithe region exists, we set them to zero. This simplifies the
255 register level micro kernel in that it does not need to support
256 different register blockings for the edge cases. */ \
257 if ( m != m_max ) \
258 { \
259 ctype* p_edge = p_cast + (m )*rs_p; \
260 \
261 PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
262 ( \
263 BLIS_NO_CONJUGATE, \
264 0, \
265 BLIS_NONUNIT_DIAG, \
266 BLIS_DENSE, \
267 m_max - m, \
268 n_max, \
269 zero, \
270 p_edge, rs_p, cs_p, \
271 cntx, \
272 NULL \
273 ); \
274 } \
275 \
276 if ( n != n_max ) \
277 { \
278 ctype* p_edge = p_cast + (n )*cs_p; \
279 \
280 PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \
281 ( \
282 BLIS_NO_CONJUGATE, \
283 0, \
284 BLIS_NONUNIT_DIAG, \
285 BLIS_DENSE, \
286 m_max, \
287 n_max - n, \
288 zero, \
289 p_edge, rs_p, cs_p, \
290 cntx, \
291 NULL \
292 ); \
293 } \
294 }
295
296 INSERT_GENTFUNC_BASIC0( packm_unb_var1 )
297
298