/*
 * -- High Performance Computing Linpack Benchmark (HPL)
 *    HPL - 2.3 - December 2, 2018
 *    Antoine P. Petitet
 *    University of Tennessee, Knoxville
 *    Innovative Computing Laboratory
 *    (C) Copyright 2000-2008 All Rights Reserved
 *
 * -- Copyright notice and Licensing terms:
 *
 * Redistribution  and  use in  source and binary forms, with or without
 * modification, are  permitted provided  that the following  conditions
 * are met:
 *
 * 1. Redistributions  of  source  code  must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce  the above copyright
 * notice, this list of conditions,  and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * 3. All  advertising  materials  mentioning  features  or  use of this
 * software must display the following acknowledgement:
 * This  product  includes  software  developed  at  the  University  of
 * Tennessee, Knoxville, Innovative Computing Laboratory.
 *
 * 4. The name of the  University,  the name of the  Laboratory,  or the
 * names  of  its  contributors  may  not  be used to endorse or promote
 * products  derived   from   this  software  without  specific  written
 * permission.
 *
 * -- Disclaimer:
 *
 * THIS  SOFTWARE  IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,  INCLUDING,  BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
 * OR  CONTRIBUTORS  BE  LIABLE FOR ANY  DIRECT,  INDIRECT,  INCIDENTAL,
 * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES  (INCLUDING,  BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA OR PROFITS; OR BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * ---------------------------------------------------------------------
 */
/*
 * Include files
 */
#include "hpl.h"

#ifdef STDC_HEADERS
void HPL_pdpancrN
(
   HPL_T_panel *                    PANEL,
   const int                        M,
   const int                        N,
   const int                        ICOFF,
   double *                         WORK
)
#else
void HPL_pdpancrN
( PANEL, M, N, ICOFF, WORK )
   HPL_T_panel *                    PANEL;
   const int                        M;
   const int                        N;
   const int                        ICOFF;
   double *                         WORK;
#endif
{
/*
 * Purpose
 * =======
 *
 * HPL_pdpancrN factorizes  a panel of columns that is a sub-array of a
 * larger one-dimensional panel  A using the Crout variant of the  usual
 * one-dimensional algorithm.  The lower triangular N0-by-N0 upper block
 * of the panel is stored in no-transpose form (i.e. just like the input
 * matrix itself).
 *
 * Bi-directional  exchange  is  used  to  perform  the  swap::broadcast
 * operations  at once  for one column in the panel.  This  results in a
 * lower number of slightly larger  messages than usual.  On P processes
 * and assuming bi-directional links,  the running time of this function
 * can be approximated by (when N is equal to N0):
 *
 *    N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) +
 *    N0^2 * ( M - N0/3 ) * gam2-3
 *
 * where M is the local number of rows of  the panel, lat and bdwth  are
 * the latency and bandwidth of the network for  double  precision  real
 * words, and gam2-3 is  an  estimate  of the  Level 2 and Level 3  BLAS
 * rate of execution.  The recursive algorithm indeed makes it possible
 * to come close to Level 3 BLAS performance in the panel factorization.
 * On a large number of modern machines,  this operation is nevertheless
 * latency bound,  meaning that its cost can be estimated by the latency
 * portion N0 * log_2(P) * lat alone.  Mono-directional links will double
 * this communication cost.
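 *
 * For illustration only (assumed values, not measurements):  with N0 =
 * 256 and P = 16,  the latency term above is 256 * log_2( 16 ) * lat =
 * 1024 * lat,  i.e. on the order of a thousand network start-ups for a
 * single panel factorization.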
 *
 * Note that one iteration of the main loop is unrolled.  The local
 * computation of the absolute value max of the next column is performed
 * right after its update by the current column.  This brings the current
 * column through cache only once at each step.  The current
 * implementation does not perform any blocking for this sequence of
 * BLAS operations;  however,  the design allows for plugging in an
 * optimal (machine-specific) specialized BLAS-like kernel.  This idea
 * was suggested to us by Fred Gustavson, IBM T.J. Watson Research Center.
 *
 * Arguments
 * =========
 *
 * PANEL   (local input/output)          HPL_T_panel *
 *         On entry,  PANEL  points to the data structure containing the
 *         panel information.
 *
 * M       (local input)                 const int
 *         On entry,  M specifies the local number of rows of sub(A).
 *
 * N       (local input)                 const int
 *         On entry,  N specifies the local number of columns of sub(A).
 *
 * ICOFF   (global input)                const int
 *         On entry, ICOFF specifies the row and column offset of sub(A)
 *         in A.
 *
 * WORK    (local workspace)             double *
 *         On entry, WORK  is a work array of size at least 2*(4+2*N0).
 *
 * ---------------------------------------------------------------------
 */
/*
 * .. Local Variables ..
 */
   double                     * A, * L1, * L1ptr;
#ifdef HPL_CALL_VSIPL
   vsip_mview_d               * Av0, * Av1, * Yv1, * Xv0, * Xv1;
#endif
   int                        Mm1, Nm1, curr, ii, iip1, jj, kk=0, lda,
                              m=M, n0;
/* ..
 * .. Executable Statements ..
 */
#ifdef HPL_DETAILED_TIMING
   HPL_ptimer( HPL_TIMING_PFACT );
#endif
   A    = PANEL->A;   lda = PANEL->lda;
   L1   = PANEL->L1;  n0  = PANEL->jb;
   curr = (int)( PANEL->grid->myrow == PANEL->prow );

   Nm1  = N - 1; jj = ICOFF;
   if( curr != 0 ) { ii = ICOFF; iip1 = ii+1; Mm1 = m-1; }
   else            { ii = 0;     iip1 = ii;   Mm1 = m;   }
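/*
 * Note: curr is non-zero on the process row that owns the rows of the
 * current diagonal block.  On that row, ii indexes the diagonal element
 * of the active column, iip1 the first subdiagonal element, and Mm1 the
 * number of local rows below the diagonal;  on every other process row
 * all m local rows lie below the diagonal.
 */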
#ifdef HPL_CALL_VSIPL
/*
 * Admit the blocks
 */
   (void) vsip_blockadmit_d( PANEL->Ablock,  VSIP_TRUE );
   (void) vsip_blockadmit_d( PANEL->L1block, VSIP_TRUE );
/*
 * Create the matrix views
 */
   Av0 = vsip_mbind_d( PANEL->Ablock,  0, 1, lda,       lda, PANEL->pmat->nq );
   Xv0 = vsip_mbind_d( PANEL->L1block, 0, 1, PANEL->jb, PANEL->jb, PANEL->jb );
#endif
/*
 * Find local absolute value max in first column - initialize WORK[0:3]
 */
   HPL_dlocmax( PANEL, m, ii, jj, WORK );

   while( Nm1 > 0 )
   {
/*
 * Swap and broadcast the current row
 */
      HPL_pdmxswp(  PANEL, m, ii, jj, WORK );
      HPL_dlocswpN( PANEL,    ii, jj, WORK );
/*
 * Compute row jj of L1 (stored in no-transposed form)
 */
      if( kk > 0 )
      {
         L1ptr = Mptr( L1, jj, jj+1, n0 );
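/*
 * Crout update of row jj of L1 using the kk columns already factored in
 * this panel, i.e. (in array notation)
 *    L1( jj, jj+1:jj+Nm1 ) -= L1( jj, ICOFF:jj-1 ) * L1( ICOFF:jj-1, jj+1:jj+Nm1 )
 * On the process row owning the diagonal block, the updated row is then
 * copied back into row ii of A.
 */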
#ifdef HPL_CALL_VSIPL
/*
 * Create the matrix subviews
 */
         Av1 = vsip_msubview_d( Xv0, ICOFF, jj+1,  kk, Nm1 );
         Xv1 = vsip_msubview_d( Xv0, jj,    ICOFF, 1,  kk  );
         Yv1 = vsip_msubview_d( Xv0, jj,    jj+1,  1,  Nm1 );

         vsip_gemp_d( -HPL_rone, Xv1, VSIP_MAT_NTRANS, Av1, VSIP_MAT_NTRANS,
                      HPL_rone, Yv1 );
/*
 * Destroy the matrix subviews
 */
         (void) vsip_mdestroy_d( Yv1 );
         (void) vsip_mdestroy_d( Xv1 );
         (void) vsip_mdestroy_d( Av1 );
#else
         HPL_dgemv( HplColumnMajor, HplTrans, kk, Nm1, -HPL_rone,
                    Mptr( L1, ICOFF, jj+1, n0 ), n0, Mptr( L1, jj,
                    ICOFF, n0 ), n0, HPL_rone, L1ptr, n0 );
#endif
         if( curr != 0 )
            HPL_dcopy( Nm1, L1ptr, n0, Mptr( A, ii, jj+1, lda ), lda );
      }
/*
 * Scale the current column by its absolute value max entry  -  Update
 * the diagonal and subdiagonal elements in column A(iip1:iip1+Mm1-1,
 * jj+1) and find the local absolute value max in that column (only one
 * pass through cache for each current column).  This sequence of
 * operations could benefit from a specialized blocked implementation.
 */
      if( WORK[0] != HPL_rzero )
         HPL_dscal( Mm1, HPL_rone / WORK[0], Mptr( A, iip1, jj, lda ), 1 );
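/*
 * Deferred (Crout) update of the next column using all kk+1 columns of
 * the panel factored so far, i.e. (in array notation)
 *    A( iip1:iip1+Mm1-1, jj+1 ) -= A( iip1:iip1+Mm1-1, ICOFF:jj ) * L1( ICOFF:jj, jj+1 )
 */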
#ifdef HPL_CALL_VSIPL
/*
 * Create the matrix subviews
 */
      Av1 = vsip_msubview_d( Av0, PANEL->ii+iip1, PANEL->jj+ICOFF, Mm1, kk+1 );
      Xv1 = vsip_msubview_d( Xv0, ICOFF,          jj+1,            kk+1,   1 );
      Yv1 = vsip_msubview_d( Av0, PANEL->ii+iip1, PANEL->jj+jj+1,  Mm1,    1 );

      vsip_gemp_d( -HPL_rone, Av1, VSIP_MAT_NTRANS, Xv1, VSIP_MAT_NTRANS,
                   HPL_rone, Yv1 );
/*
 * Destroy the matrix subviews
 */
      vsip_mdestroy_d( Yv1 );
      vsip_mdestroy_d( Xv1 );
      vsip_mdestroy_d( Av1 );
#else
      HPL_dgemv( HplColumnMajor, HplNoTrans, Mm1, kk+1, -HPL_rone,
                 Mptr( A, iip1, ICOFF, lda ), lda, Mptr( L1, ICOFF,
                 jj+1, n0 ), 1, HPL_rone, Mptr( A, iip1, jj+1, lda ),
                 1 );
#endif
      HPL_dlocmax( PANEL, Mm1, iip1, jj+1, WORK );
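/*
 * On the process row owning the diagonal block, step past the entry of
 * the column that has just been factored:  the active part of the next
 * column starts one row lower and is one element shorter.
 */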
      if( curr != 0 ) { ii = iip1; iip1++; m = Mm1; Mm1--; }

      Nm1--; jj++; kk++;
   }
/*
 * Swap and broadcast last row - Scale last column by its absolute value
 * max entry
 */
   HPL_pdmxswp(  PANEL, m, ii, jj, WORK );
   HPL_dlocswpN( PANEL,    ii, jj, WORK );
   if( WORK[0] != HPL_rzero )
      HPL_dscal( Mm1, HPL_rone / WORK[0], Mptr( A, iip1, jj, lda ), 1 );

#ifdef HPL_CALL_VSIPL
/*
 * Release the blocks
 */
   (void) vsip_blockrelease_d( vsip_mgetblock_d( Xv0 ), VSIP_TRUE );
   (void) vsip_blockrelease_d( vsip_mgetblock_d( Av0 ), VSIP_TRUE );
/*
 * Destroy the matrix views
 */
   (void) vsip_mdestroy_d( Xv0 );
   (void) vsip_mdestroy_d( Av0 );
#endif
#ifdef HPL_DETAILED_TIMING
   HPL_ptimer( HPL_TIMING_PFACT );
#endif
/*
 * End of HPL_pdpancrN
 */
}
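/*
 * Note: HPL_pdpancrN is the Crout panel-factorization kernel with L1
 * kept in no-transposed storage;  in a standard HPL build it is one of
 * the variants selected through the "PFACTs" and "L1 in (0=transposed,
 * 1=no-transposed) form" lines of the HPL.dat input file.
 */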