1 /*
2 * -- High Performance Computing Linpack Benchmark (HPL)
3 * HPL - 2.3 - December 2, 2018
4 * Antoine P. Petitet
5 * University of Tennessee, Knoxville
6 * Innovative Computing Laboratory
7 * (C) Copyright 2000-2008 All Rights Reserved
8 *
9 * -- Copyright notice and Licensing terms:
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 *
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 *
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions, and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 *
22 * 3. All advertising materials mentioning features or use of this
23 * software must display the following acknowledgement:
24 * This product includes software developed at the University of
25 * Tennessee, Knoxville, Innovative Computing Laboratory.
26 *
27 * 4. The name of the University, the name of the Laboratory, or the
28 * names of its contributors may not be used to endorse or promote
29 * products derived from this software without specific written
30 * permission.
31 *
32 * -- Disclaimer:
33 *
34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
35 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
38 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
41 * DATA OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
45 * ---------------------------------------------------------------------
46 */
47 /*
48 * Include files
49 */
50 #include "hpl.h"
51
52 #ifdef STDC_HEADERS
HPL_pdpancrN(HPL_T_panel * PANEL,const int M,const int N,const int ICOFF,double * WORK)53 void HPL_pdpancrN
54 (
55 HPL_T_panel * PANEL,
56 const int M,
57 const int N,
58 const int ICOFF,
59 double * WORK
60 )
61 #else
62 void HPL_pdpancrN
63 ( PANEL, M, N, ICOFF, WORK )
64 HPL_T_panel * PANEL;
65 const int M;
66 const int N;
67 const int ICOFF;
68 double * WORK;
69 #endif
70 {
71 /*
72 * Purpose
73 * =======
74 *
75 * HPL_pdpancrN factorizes a panel of columns that is a sub-array of a
76 * larger one-dimensional panel A using the Crout variant of the usual
77 * one-dimensional algorithm. The lower triangular N0-by-N0 upper block
78 * of the panel is stored in no-transpose form (i.e. just like the input
79 * matrix itself).
80 *
81 * Bi-directional exchange is used to perform the swap::broadcast
82 * operations at once for one column in the panel. This results in a
83 * lower number of slightly larger messages than usual. On P processes
84 * and assuming bi-directional links, the running time of this function
85 * can be approximated by (when N is equal to N0):
86 *
87 * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) +
88 * N0^2 * ( M - N0/3 ) * gam2-3
89 *
90 * where M is the local number of rows of the panel, lat and bdwth are
91 * the latency and bandwidth of the network for double precision real
92 * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS
93 * rate of execution. The recursive algorithm allows indeed to almost
94 * achieve Level 3 BLAS performance in the panel factorization. On a
95 * large number of modern machines, this operation is however latency
96 * bound, meaning that its cost can be estimated by only the latency
97 * portion N0 * log_2(P) * lat. Mono-directional links will double this
98 * communication cost.
99 *
100 * Note that one iteration of the the main loop is unrolled. The local
101 * computation of the absolute value max of the next column is performed
102 * just after its update by the current column. This allows to bring the
103 * current column only once through cache at each step. The current
104 * implementation does not perform any blocking for this sequence of
105 * BLAS operations, however the design allows for plugging in an optimal
106 * (machine-specific) specialized BLAS-like kernel. This idea has been
107 * suggested to us by Fred Gustavson, IBM T.J. Watson Research Center.
108 *
109 * Arguments
110 * =========
111 *
112 * PANEL (local input/output) HPL_T_panel *
113 * On entry, PANEL points to the data structure containing the
114 * panel information.
115 *
116 * M (local input) const int
117 * On entry, M specifies the local number of rows of sub(A).
118 *
119 * N (local input) const int
120 * On entry, N specifies the local number of columns of sub(A).
121 *
122 * ICOFF (global input) const int
123 * On entry, ICOFF specifies the row and column offset of sub(A)
124 * in A.
125 *
126 * WORK (local workspace) double *
127 * On entry, WORK is a workarray of size at least 2*(4+2*N0).
128 *
129 * ---------------------------------------------------------------------
130 */
131 /*
132 * .. Local Variables ..
133 */
134 double * A, * L1, * L1ptr;
135 #ifdef HPL_CALL_VSIPL
136 vsip_mview_d * Av0, * Av1, * Yv1, * Xv0, * Xv1;
137 #endif
138 int Mm1, Nm1, curr, ii, iip1, jj, kk=0, lda,
139 m=M, n0;
140 /* ..
141 * .. Executable Statements ..
142 */
143 #ifdef HPL_DETAILED_TIMING
144 HPL_ptimer( HPL_TIMING_PFACT );
145 #endif
146 A = PANEL->A; lda = PANEL->lda;
147 L1 = PANEL->L1; n0 = PANEL->jb;
148 curr = (int)( PANEL->grid->myrow == PANEL->prow );
149
150 Nm1 = N - 1; jj = ICOFF;
151 if( curr != 0 ) { ii = ICOFF; iip1 = ii+1; Mm1 = m-1; }
152 else { ii = 0; iip1 = ii; Mm1 = m; }
153 #ifdef HPL_CALL_VSIPL
154 /*
155 * Admit the blocks
156 */
157 (void) vsip_blockadmit_d( PANEL->Ablock, VSIP_TRUE );
158 (void) vsip_blockadmit_d( PANEL->L1block, VSIP_TRUE );
159 /*
160 * Create the matrix views
161 */
162 Av0 = vsip_mbind_d( PANEL->Ablock, 0, 1, lda, lda, PANEL->pmat->nq );
163 Xv0 = vsip_mbind_d( PANEL->L1block, 0, 1, PANEL->jb, PANEL->jb, PANEL->jb );
164 #endif
165 /*
166 * Find local absolute value max in first column - initialize WORK[0:3]
167 */
168 HPL_dlocmax( PANEL, m, ii, jj, WORK );
169
170 while( Nm1 > 0 )
171 {
172 /*
173 * Swap and broadcast the current row
174 */
175 HPL_pdmxswp( PANEL, m, ii, jj, WORK );
176 HPL_dlocswpN( PANEL, ii, jj, WORK );
177 /*
178 * Compute row (column) jj of L1
179 */
180 if( kk > 0 )
181 {
182 L1ptr = Mptr( L1, jj, jj+1, n0 );
183 #ifdef HPL_CALL_VSIPL
184 /*
185 * Create the matrix subviews
186 */
187 Av1 = vsip_msubview_d( Xv0, ICOFF, jj+1, kk, Nm1 );
188 Xv1 = vsip_msubview_d( Xv0, jj, ICOFF, 1, kk );
189 Yv1 = vsip_msubview_d( Xv0, jj, jj+1, 1, Nm1 );
190
191 vsip_gemp_d( -HPL_rone, Xv1, VSIP_MAT_NTRANS, Av1, VSIP_MAT_NTRANS,
192 HPL_rone, Yv1 );
193 /*
194 * Destroy the matrix subviews
195 */
196 (void) vsip_mdestroy_d( Yv1 );
197 (void) vsip_mdestroy_d( Xv1 );
198 (void) vsip_mdestroy_d( Av1 );
199 #else
200 HPL_dgemv( HplColumnMajor, HplTrans, kk, Nm1, -HPL_rone,
201 Mptr( L1, ICOFF, jj+1, n0 ), n0, Mptr( L1, jj,
202 ICOFF, n0 ), n0, HPL_rone, L1ptr, n0 );
203 #endif
204 if( curr != 0 )
205 HPL_dcopy( Nm1, L1ptr, n0, Mptr( A, ii, jj+1, lda ), lda );
206 }
207 /*
208 * Scale current column by its absolute value max entry - Update dia-
209 * diagonal and subdiagonal elements in column A(iip1:iip1+Mm1-1, jj+1)
210 * and find local absolute value max in that column (Only one pass
211 * through cache for each current column). This sequence of operations
212 * could benefit from a specialized blocked implementation.
213 */
214 if( WORK[0] != HPL_rzero )
215 HPL_dscal( Mm1, HPL_rone / WORK[0], Mptr( A, iip1, jj, lda ), 1 );
216 #ifdef HPL_CALL_VSIPL
217 /*
218 * Create the matrix subviews
219 */
220 Av1 = vsip_msubview_d( Av0, PANEL->ii+iip1, PANEL->jj+ICOFF, Mm1, kk+1 );
221 Xv1 = vsip_msubview_d( Xv0, ICOFF, jj+1, kk+1, 1 );
222 Yv1 = vsip_msubview_d( Av0, PANEL->ii+iip1, PANEL->jj+jj+1, Mm1, 1 );
223
224 vsip_gemp_d( -HPL_rone, Av1, VSIP_MAT_NTRANS, Xv1, VSIP_MAT_NTRANS,
225 HPL_rone, Yv1 );
226 /*
227 * Destroy the matrix subviews
228 */
229 vsip_mdestroy_d( Yv1 );
230 vsip_mdestroy_d( Xv1 );
231 vsip_mdestroy_d( Av1 );
232 #else
233 HPL_dgemv( HplColumnMajor, HplNoTrans, Mm1, kk+1, -HPL_rone,
234 Mptr( A, iip1, ICOFF, lda ), lda, Mptr( L1, ICOFF,
235 jj+1, n0 ), 1, HPL_rone, Mptr( A, iip1, jj+1, lda ),
236 1 );
237 #endif
238 HPL_dlocmax( PANEL, Mm1, iip1, jj+1, WORK );
239 if( curr != 0 ) { ii = iip1; iip1++; m = Mm1; Mm1--; }
240
241 Nm1--; jj++; kk++;
242 }
243 /*
244 * Swap and broadcast last row - Scale last column by its absolute value
245 * max entry
246 */
247 HPL_pdmxswp( PANEL, m, ii, jj, WORK );
248 HPL_dlocswpN( PANEL, ii, jj, WORK );
249 if( WORK[0] != HPL_rzero )
250 HPL_dscal( Mm1, HPL_rone / WORK[0], Mptr( A, iip1, jj, lda ), 1 );
251
252 #ifdef HPL_CALL_VSIPL
253 /*
254 * Release the blocks
255 */
256 (void) vsip_blockrelease_d( vsip_mgetblock_d( Xv0 ), VSIP_TRUE );
257 (void) vsip_blockrelease_d( vsip_mgetblock_d( Av0 ), VSIP_TRUE );
258 /*
259 * Destroy the matrix views
260 */
261 (void) vsip_mdestroy_d( Xv0 );
262 (void) vsip_mdestroy_d( Av0 );
263 #endif
264 #ifdef HPL_DETAILED_TIMING
265 HPL_ptimer( HPL_TIMING_PFACT );
266 #endif
267 /*
268 * End of HPL_pdpancrN
269 */
270 }
271