/*
 * -- High Performance Computing Linpack Benchmark (HPL)
 *    HPL - 2.3 - December 2, 2018
 *    Antoine P. Petitet
 *    University of Tennessee, Knoxville
 *    Innovative Computing Laboratory
 *    (C) Copyright 2000-2008 All Rights Reserved
 *
 * -- Copyright notice and Licensing terms:
 *
 * Redistribution  and  use in  source and binary forms, with or without
 * modification, are  permitted provided  that the following  conditions
 * are met:
 *
 * 1. Redistributions  of  source  code  must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce  the above copyright
 * notice, this list of conditions,  and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * 3. All  advertising  materials  mentioning  features  or  use of this
 * software must display the following acknowledgement:
 * This  product  includes  software  developed  at  the  University  of
 * Tennessee, Knoxville, Innovative Computing Laboratory.
 *
 * 4. The name of the  University,  the name of the  Laboratory,  or the
 * names  of  its  contributors  may  not  be used to endorse or promote
 * products  derived   from   this  software  without  specific  written
 * permission.
 *
 * -- Disclaimer:
 *
 * THIS  SOFTWARE  IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,  INCLUDING,  BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
 * OR  CONTRIBUTORS  BE  LIABLE FOR ANY  DIRECT,  INDIRECT,  INCIDENTAL,
 * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES  (INCLUDING,  BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA OR PROFITS; OR BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * ---------------------------------------------------------------------
 */
/*
 * Include files
 */
#include "hpl.h"

#ifdef STDC_HEADERS
void HPL_pdrpanllN
(
   HPL_T_panel *                    PANEL,
   const int                        M,
   const int                        N,
   const int                        ICOFF,
   double *                         WORK
)
#else
void HPL_pdrpanllN
( PANEL, M, N, ICOFF, WORK )
   HPL_T_panel *                    PANEL;
   const int                        M;
   const int                        N;
   const int                        ICOFF;
   double *                         WORK;
#endif
{
/*
 * Purpose
 * =======
 *
 * HPL_pdrpanllN recursively  factorizes  a panel  of columns using  the
 * recursive Left-looking variant of the one-dimensional algorithm.  The
 * lower triangular  N0-by-N0  upper block  of  the  panel  is stored in
 * no-transpose form (i.e. just like the input matrix itself).
 *
 * Bi-directional  exchange  is  used  to  perform  the  swap::broadcast
 * operations  at once  for one column in the panel.  This  results in a
 * lower number of slightly larger  messages than usual.  On P processes
 * and assuming bi-directional links,  the running time of this function
 * can be approximated by (when N is equal to N0):
 *
 *    N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) +
 *    N0^2 * ( M - N0/3 ) * gam2-3
 *
 * where M is the local number of rows of  the panel, lat and bdwth  are
 * the latency and bandwidth of the network for  double  precision  real
 * words,  and  gam2-3  is an estimate of the  Level 2 and Level 3  BLAS
 * rate of execution.  The recursive algorithm indeed allows one to
 * almost achieve Level 3 BLAS performance in the panel factorization.
 * On a large number of modern machines, however, this operation is
 * latency bound, meaning that its cost can be estimated by the latency
 * portion N0 * log_2(P) * lat alone.  Mono-directional links will
 * double this communication cost.
 *
 * Arguments
 * =========
 *
 * PANEL   (local input/output)          HPL_T_panel *
 *         On entry,  PANEL  points to the data structure containing the
 *         panel information.
 *
 * M       (local input)                 const int
 *         On entry,  M specifies the local number of rows of sub(A).
 *
 * N       (local input)                 const int
 *         On entry,  N specifies the local number of columns of sub(A).
 *
 * ICOFF   (global input)                const int
 *         On entry, ICOFF specifies the row and column offset of sub(A)
 *         in A.
 *
 * WORK    (local workspace)             double *
 *         On entry, WORK  is a work array of size at least 2*(4+2*N0).
 *
 * ---------------------------------------------------------------------
 */
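/*
 * Purely illustrative instance of the estimate above (the numbers are
 * arbitrary and not taken from this file):  with  P = 16  and  N0 = N =
 * 256,  log_2( P ) = 4  and  2*N0 + 4 = 516,  so the communication term
 * becomes  256 * 4 * ( lat + 516 / bdwth ),  i.e.  1024 messages of 516
 * double precision words each.
 */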
/*
 * .. Local Variables ..
 */
   double                     * A, * Aptr, * L1, * L1ptr;
#ifdef HPL_CALL_VSIPL
   vsip_mview_d               * Av0, * Lv0, * Av1, * Av2, * Lv1;
#endif
   int                        curr, ii, ioff, jb, jj, lda, m, n, n0, nb,
                              nbdiv, nbmin;
/* ..
 * .. Executable Statements ..
 */
   if( N <= ( nbmin = PANEL->algo->nbmin ) )
   { PANEL->algo->pffun( PANEL, M, N, ICOFF, WORK ); return; }
/*
 * Find  new recursive blocking factor.  To avoid an infinite loop,  one
 * must guarantee: 1 <= jb < N, knowing that  N  is greater than  NBMIN.
 * First, we compute nblocks:  the number of blocks of size  NBMIN in N,
 * including the last one that may be smaller.  nblocks  is thus  larger
 * than or equal to one, since N >= NBMIN.
 * The ratio ( nblocks + NDIV - 1 ) / NDIV  is thus larger than or equal
 * to one as  well.  For  NDIV >= 2,  we  are guaranteed  that the quan-
 * tity ( ( nblocks + NDIV  - 1 ) / NDIV ) * NBMIN  is less  than N  and
 * greater than or equal to NBMIN.
 */
   nbdiv = PANEL->algo->nbdiv; ii = jj = 0; m = M; n = N;
   nb = jb = ( (((N+nbmin-1) / nbmin) + nbdiv  - 1) / nbdiv ) * nbmin;
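/*
 * Illustrative example (arbitrary values, not the defaults of this
 * file):  with  N = 60,  NBMIN = 8  and  NDIV = 2,  the number of
 * NBMIN-sized blocks is ( 60 + 7 ) / 8 = 8,  so that, using integer
 * division,  jb = ( ( 8 + 1 ) / 2 ) * 8 = 4 * 8 = 32,  which indeed
 * satisfies  NBMIN <= jb < N.
 */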

   A     = PANEL->A;   lda = PANEL->lda;
   L1    = PANEL->L1;  n0  = PANEL->jb;
   L1ptr = Mptr( L1, ICOFF, ICOFF, n0 );
   curr  = (int)( PANEL->grid->myrow == PANEL->prow );

   if( curr != 0 ) Aptr = Mptr( A, ICOFF, ICOFF, lda );
   else            Aptr = Mptr( A,     0, ICOFF, lda );
/*
 * The triangular solve is replicated in every  process row.  The  panel
 * factorization is  such that  the first rows of  A  are accumulated in
 * every process row during the (panel) swapping phase.  This way, we
 * ensure a minimum amount of communication during the entire panel
 * factorization.
 */
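/*
 * Each iteration of the loop below processes one block of jb columns:
 * a replicated triangular solve against the already factored columns
 * of L1,  a local update of the block with the columns to its left,  a
 * recursive factorization of the block itself,  and,  when curr is non-
 * zero, a copy of its upper part back into A.
 */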
   do
   {
      n -= jb; ioff = ICOFF + jj;
/*
 * Replicated solve - Local update - Factor current panel
 */
      HPL_dtrsm( HplColumnMajor, HplLeft, HplLower, HplNoTrans, HplUnit,
                 jj, jb, HPL_rone, L1ptr, n0, Mptr( L1ptr, 0, jj, n0 ),
                 n0 );
#ifdef HPL_CALL_VSIPL
/*
 * Admit the blocks
 */
      (void) vsip_blockadmit_d( PANEL->Ablock,  VSIP_TRUE );
      (void) vsip_blockadmit_d( PANEL->L1block, VSIP_TRUE );
/*
 * Create the matrix views
 */
      Av0 = vsip_mbind_d( PANEL->Ablock,  0, 1, lda, lda, PANEL->pmat->nq );
      Lv0 = vsip_mbind_d( PANEL->L1block, 0, 1,  n0,  n0, n0              );
/*
 * Create the matrix subviews
 */
      if( curr != 0 )
      {
         Av1 = vsip_msubview_d( Av0, PANEL->ii+ICOFF+ii, PANEL->jj+ICOFF,
                                m, jj );
         Av2 = vsip_msubview_d( Av0, PANEL->ii+ICOFF+ii, PANEL->jj+ioff,
                                m, jb );
      }
      else
      {
         Av1 = vsip_msubview_d( Av0, PANEL->ii+ii, PANEL->jj+ICOFF, m,  jj );
         Av2 = vsip_msubview_d( Av0, PANEL->ii+ii, PANEL->jj+ioff,  m,  jb );
      }
      Lv1 = vsip_msubview_d( Lv0, ICOFF, ioff, jj, jb );

      vsip_gemp_d( -HPL_rone, Av1, VSIP_MAT_NTRANS, Lv1, VSIP_MAT_NTRANS,
                   HPL_rone, Av2 );
/*
 * Destroy the matrix subviews
 */
      (void) vsip_mdestroy_d( Lv1 );
      (void) vsip_mdestroy_d( Av2 );
      (void) vsip_mdestroy_d( Av1 );
/*
 * Release the blocks
 */
      (void) vsip_blockrelease_d( vsip_mgetblock_d( Lv0 ), VSIP_TRUE );
      (void) vsip_blockrelease_d( vsip_mgetblock_d( Av0 ), VSIP_TRUE );
/*
 * Destroy the matrix views
 */
      (void) vsip_mdestroy_d( Lv0 );
      (void) vsip_mdestroy_d( Av0 );
#else
      HPL_dgemm( HplColumnMajor, HplNoTrans, HplNoTrans, m, jb,
                 jj, -HPL_rone, Mptr( Aptr, ii, 0, lda ), lda,
                 Mptr( L1ptr, 0, jj, n0 ), n0, HPL_rone,
                 Mptr( Aptr, ii, jj, lda ), lda );
#endif
      HPL_pdrpanllN( PANEL, m, jb, ioff, WORK );
/*
 * Copy back upper part of A in current process row - Go to the next block
 */
      if( curr != 0 )
      {
         HPL_dlacpy( ioff, jb, Mptr( L1, 0, ioff, n0 ), n0,
                     Mptr( A, 0, ioff, lda ), lda );
         ii += jb; m -= jb;
      }
      jj += jb; jb = Mmin( n, nb );

   } while( n > 0 );
/*
 * End of HPL_pdrpanllN
 */
}