1 /*
2 * -- High Performance Computing Linpack Benchmark (HPL)
3 * HPL - 2.3 - December 2, 2018
4 * Antoine P. Petitet
5 * University of Tennessee, Knoxville
6 * Innovative Computing Laboratory
7 * (C) Copyright 2000-2008 All Rights Reserved
8 *
9 * -- Copyright notice and Licensing terms:
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 *
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 *
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions, and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 *
22 * 3. All advertising materials mentioning features or use of this
23 * software must display the following acknowledgement:
24 * This product includes software developed at the University of
25 * Tennessee, Knoxville, Innovative Computing Laboratory.
26 *
27 * 4. The name of the University, the name of the Laboratory, or the
28 * names of its contributors may not be used to endorse or promote
29 * products derived from this software without specific written
30 * permission.
31 *
32 * -- Disclaimer:
33 *
34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
35 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
38 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
41 * DATA OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
45 * ---------------------------------------------------------------------
46 */
47 /*
48 * Include files
49 */
50 #include "hpl.h"
51
52 #ifdef STDC_HEADERS
HPL_pdrpanllN(HPL_T_panel * PANEL,const int M,const int N,const int ICOFF,double * WORK)53 void HPL_pdrpanllN
54 (
55 HPL_T_panel * PANEL,
56 const int M,
57 const int N,
58 const int ICOFF,
59 double * WORK
60 )
61 #else
62 void HPL_pdrpanllN
63 ( PANEL, M, N, ICOFF, WORK )
64 HPL_T_panel * PANEL;
65 const int M;
66 const int N;
67 const int ICOFF;
68 double * WORK;
69 #endif
70 {
71 /*
72 * Purpose
73 * =======
74 *
75 * HPL_pdrpanllN recursively factorizes a panel of columns using the
76 * recursive Left-looking variant of the one-dimensional algorithm. The
77 * lower triangular N0-by-N0 upper block of the panel is stored in
78 * no-transpose form (i.e. just like the input matrix itself).
79 *
80 * Bi-directional exchange is used to perform the swap::broadcast
81 * operations at once for one column in the panel. This results in a
82 * lower number of slightly larger messages than usual. On P processes
83 * and assuming bi-directional links, the running time of this function
84 * can be approximated by (when N is equal to N0):
85 *
86 * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) +
87 * N0^2 * ( M - N0/3 ) * gam2-3
88 *
89 * where M is the local number of rows of the panel, lat and bdwth are
90 * the latency and bandwidth of the network for double precision real
91 * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS
92 * rate of execution. The recursive algorithm allows indeed to almost
93 * achieve Level 3 BLAS performance in the panel factorization. On a
94 * large number of modern machines, this operation is however latency
95 * bound, meaning that its cost can be estimated by only the latency
96 * portion N0 * log_2(P) * lat. Mono-directional links will double this
97 * communication cost.
98 *
99 * Arguments
100 * =========
101 *
102 * PANEL (local input/output) HPL_T_panel *
103 * On entry, PANEL points to the data structure containing the
104 * panel information.
105 *
106 * M (local input) const int
107 * On entry, M specifies the local number of rows of sub(A).
108 *
109 * N (local input) const int
110 * On entry, N specifies the local number of columns of sub(A).
111 *
112 * ICOFF (global input) const int
113 * On entry, ICOFF specifies the row and column offset of sub(A)
114 * in A.
115 *
116 * WORK (local workspace) double *
117 * On entry, WORK is a workarray of size at least 2*(4+2*N0).
118 *
119 * ---------------------------------------------------------------------
120 */
121 /*
122 * .. Local Variables ..
123 */
124 double * A, * Aptr, * L1, * L1ptr;
125 #ifdef HPL_CALL_VSIPL
126 vsip_mview_d * Av0, * Lv0, * Av1, * Av2, * Lv1;
127 #endif
128 int curr, ii, ioff, jb, jj, lda, m, n, n0, nb,
129 nbdiv, nbmin;
130 /* ..
131 * .. Executable Statements ..
132 */
133 if( N <= ( nbmin = PANEL->algo->nbmin ) )
134 { PANEL->algo->pffun( PANEL, M, N, ICOFF, WORK ); return; }
135 /*
136 * Find new recursive blocking factor. To avoid an infinite loop, one
137 * must guarantee: 1 <= jb < N, knowing that N is greater than NBMIN.
138 * First, we compute nblocks: the number of blocks of size NBMIN in N,
139 * including the last one that may be smaller. nblocks is thus larger
140 * than or equal to one, since N >= NBMIN.
141 * The ratio ( nblocks + NDIV - 1 ) / NDIV is thus larger than or equal
142 * to one as well. For NDIV >= 2, we are guaranteed that the quan-
143 * tity ( ( nblocks + NDIV - 1 ) / NDIV ) * NBMIN is less than N and
144 * greater than or equal to NBMIN.
145 */
146 nbdiv = PANEL->algo->nbdiv; ii = jj = 0; m = M; n = N;
147 nb = jb = ( (((N+nbmin-1) / nbmin) + nbdiv - 1) / nbdiv ) * nbmin;
148
149 A = PANEL->A; lda = PANEL->lda;
150 L1 = PANEL->L1; n0 = PANEL->jb;
151 L1ptr = Mptr( L1, ICOFF, ICOFF, n0 );
152 curr = (int)( PANEL->grid->myrow == PANEL->prow );
153
154 if( curr != 0 ) Aptr = Mptr( A, ICOFF, ICOFF, lda );
155 else Aptr = Mptr( A, 0, ICOFF, lda );
156 /*
157 * The triangular solve is replicated in every process row. The panel
158 * factorization is such that the first rows of A are accumulated in
159 * every process row during the (panel) swapping phase. We ensure this
160 * way a minimum amount of communication during the entire panel facto-
161 * rization.
162 */
163 do
164 {
165 n -= jb; ioff = ICOFF + jj;
166 /*
167 * Replicated solve - Local update - Factor current panel
168 */
169 HPL_dtrsm( HplColumnMajor, HplLeft, HplLower, HplNoTrans, HplUnit,
170 jj, jb, HPL_rone, L1ptr, n0, Mptr( L1ptr, 0, jj, n0 ),
171 n0 );
172 #ifdef HPL_CALL_VSIPL
173 /*
174 * Admit the blocks
175 */
176 (void) vsip_blockadmit_d( PANEL->Ablock, VSIP_TRUE );
177 (void) vsip_blockadmit_d( PANEL->L1block, VSIP_TRUE );
178 /*
179 * Create the matrix views
180 */
181 Av0 = vsip_mbind_d( PANEL->Ablock, 0, 1, lda, lda, PANEL->pmat->nq );
182 Lv0 = vsip_mbind_d( PANEL->L1block, 0, 1, n0, n0, n0 );
183 /*
184 * Create the matrix subviews
185 */
186 if( curr != 0 )
187 {
188 Av1 = vsip_msubview_d( Av0, PANEL->ii+ICOFF+ii, PANEL->jj+ICOFF,
189 m, jj );
190 Av2 = vsip_msubview_d( Av0, PANEL->ii+ICOFF+ii, PANEL->jj+ioff,
191 m, jj );
192 }
193 else
194 {
195 Av1 = vsip_msubview_d( Av0, PANEL->ii+ii, PANEL->jj+ICOFF, m, jj );
196 Av2 = vsip_msubview_d( Av0, PANEL->ii+ii, PANEL->jj+ioff, m, jj );
197 }
198 Lv1 = vsip_msubview_d( Lv0, ICOFF, ioff, jj, jb );
199
200 vsip_gemp_d( -HPL_rone, Av1, VSIP_MAT_NTRANS, Lv1, VSIP_MAT_NTRANS,
201 HPL_rone, Av2 );
202 /*
203 * Destroy the matrix subviews
204 */
205 (void) vsip_mdestroy_d( Lv1 );
206 (void) vsip_mdestroy_d( Av2 );
207 (void) vsip_mdestroy_d( Av1 );
208 /*
209 * Release the blocks
210 */
211 (void) vsip_blockrelease_d( vsip_mgetblock_d( Lv0 ), VSIP_TRUE );
212 (void) vsip_blockrelease_d( vsip_mgetblock_d( Av0 ), VSIP_TRUE );
213 /*
214 * Destroy the matrix views
215 */
216 (void) vsip_mdestroy_d( Lv0 );
217 (void) vsip_mdestroy_d( Av0 );
218 #else
219 HPL_dgemm( HplColumnMajor, HplNoTrans, HplNoTrans, m, jb,
220 jj, -HPL_rone, Mptr( Aptr, ii, 0, lda ), lda,
221 Mptr( L1ptr, 0, jj, n0 ), n0, HPL_rone,
222 Mptr( Aptr, ii, jj, lda ), lda );
223 #endif
224 HPL_pdrpanllN( PANEL, m, jb, ioff, WORK );
225 /*
226 * Copy back upper part of A in current process row - Go the next block
227 */
228 if( curr != 0 )
229 {
230 HPL_dlacpy( ioff, jb, Mptr( L1, 0, ioff, n0 ), n0,
231 Mptr( A, 0, ioff, lda ), lda );
232 ii += jb; m -= jb;
233 }
234 jj += jb; jb = Mmin( n, nb );
235
236 } while( n > 0 );
237 /*
238 * End of HPL_pdrpanllN
239 */
240 }
241