1 /*
2  * Copyright (c) 2017, NVIDIA CORPORATION.  All rights reserved.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  *
16  */
17 
18 /* exch.c -- total exchange routine */
19 
20 #include "stdioInterf.h"
21 #include "fioMacros.h"
22 
23 #include "fort_vars.h"
24 
25 static int _1 = 1;
26 static chdr *exch[16] = {NULL};
27 static int *rbuf, *sbuf, *vcounts, *vrbuf, *vsbuf;
28 static int rmax, smax, vrmax; /* * * * NOT THREAD SAFE * * * */
29 
30 /* counts is an integer array of length 2**ceil(log2 np).  For each
31    pair of processors {i,j}, counts[j] on processor i is exchanged
32    with counts[i] on processor j, and the maximum count value is
33    returned.  Ceil(log2 np) communication steps are required.  If the
34    number of processors is not a power of two, some processors do
35    double duty as "virtual processors". */
36 
37 int
__fort_exchange_counts(int * counts)38 __fort_exchange_counts(int *counts)
39 {
40   chdr *c;
41   int cpu, lcpu, i, j, k, l, m, n, tcpus, vcpu, vme;
42   int *tcpus_addr;
43 
44   lcpu = GET_DIST_LCPU;
45   tcpus = GET_DIST_TCPUS;
46   tcpus_addr = GET_DIST_TCPUS_ADDR;
47   smax = counts[0]; /* initial maximum */
48   if (tcpus == 1)
49     return smax;
50 
51   for (i = tcpus; --i > 0;) {
52     if (counts[i] > smax)
53       smax = counts[i];
54   }
55 
56 #if defined(DEBUG)
57   if (__fort_test & DEBUG_EXCH) {
58     for (i = tcpus; i < __fort_np2; ++i)
59       counts[i] = -lcpu;
60     printf("%d exch counts", lcpu);
61     for (i = 0; i < tcpus; ++i)
62       printf(" %d", counts[i]);
63     printf("\n");
64   }
65 #endif
66 
67   m = __fort_np2 >> 1; /* message length, also most
68                           significant bit of processor number */
69   vme = lcpu ^ m;     /* virtual processor number */
70 
71   /* one-time setup... */
72 
73   if (exch[0] == NULL) {
74 
75     /* allocate buffers */
76 
77     n = __fort_np2;
78     if (__fort_np2 != tcpus)
79       n *= 3; /* more buffer needed for virtual processor */
80     rbuf = (int *)__fort_gmalloc(n * sizeof(int));
81     sbuf = rbuf + m;
82     if (__fort_np2 != tcpus) {
83       vrbuf = sbuf + m;
84       vsbuf = vrbuf + m;
85       vcounts = vsbuf + m;
86     } else
87       vsbuf = vrbuf = vcounts = NULL;
88 
89     /* initialize channels */
90 
91     for (n = 0, l = 1; l < tcpus; ++n, l <<= 1) {
92       c = __fort_chn_1to1(NULL, 1, 0, tcpus_addr, &_1, 1, 0, tcpus_addr, &_1);
93       cpu = lcpu ^ l;
94       if (cpu >= tcpus)
95         cpu ^= m;
96       vcpu = vme ^ l;
97       if (vcpu >= tcpus)
98         vcpu ^= m;
99 #if defined(DEBUG)
100       if (__fort_test & DEBUG_EXCH) {
101         printf("%d exch l=%d cpu=%d(%d)\n", lcpu, l, lcpu ^ l, cpu);
102         if (vme >= tcpus) {
103           printf("%d exch l=%d cpu=%d(%d)\n", vme, l, vme ^ l, vcpu);
104         }
105       }
106 #endif
107       __fort_sendl(c, cpu, &smax, 1, 1, __CINT, sizeof(int));
108       __fort_sendl(c, cpu, sbuf, m, 1, __CINT, sizeof(int));
109       if (vme >= tcpus) {
110         __fort_sendl(c, vcpu, &smax, 1, 1, __CINT, sizeof(int));
111         __fort_sendl(c, vcpu, vsbuf, m, 1, __CINT, sizeof(int));
112         if (cpu == lcpu) {
113           __fort_recvl(c, vcpu, &vrmax, 1, 1, __CINT, sizeof(int));
114           __fort_recvl(c, vcpu, vrbuf, m, 1, __CINT, sizeof(int));
115           __fort_recvl(c, cpu, &rmax, 1, 1, __CINT, sizeof(int));
116           __fort_recvl(c, cpu, rbuf, m, 1, __CINT, sizeof(int));
117         } else {
118           __fort_recvl(c, cpu, &rmax, 1, 1, __CINT, sizeof(int));
119           __fort_recvl(c, cpu, rbuf, m, 1, __CINT, sizeof(int));
120           __fort_recvl(c, vcpu, &vrmax, 1, 1, __CINT, sizeof(int));
121           __fort_recvl(c, vcpu, vrbuf, m, 1, __CINT, sizeof(int));
122         }
123       } else {
124         __fort_recvl(c, cpu, &rmax, 1, 1, __CINT, sizeof(int));
125         __fort_recvl(c, cpu, rbuf, m, 1, __CINT, sizeof(int));
126       }
127       __fort_chn_prune(c);
128       exch[n] = c;
129     }
130   }
131 
132 #if defined(DEBUG)
133   if (vme >= tcpus) {
134     for (i = 0; i < __fort_np2; ++i)
135       vcounts[i] = -vme;
136   }
137 #endif
138 
139   /* do the exchanges */
140 
141   for (n = 0, l = 1; l < tcpus; ++n, l <<= 1) {
142     cpu = lcpu ^ l;
143     vcpu = vme ^ l;
144     for (i = cpu & l, j = 0; j < m; i += l) {
145       for (k = j + l; j < k; ++i, ++j) {
146         sbuf[j] = counts[i];
147       }
148     }
149     if (vme >= tcpus) {
150       for (i = vcpu & l, j = 0; j < m; i += l) {
151         for (k = j + l; j < k; ++i, ++j) {
152           vsbuf[j] = vcounts[i];
153         }
154       }
155     }
156     __fort_doit(exch[n]);
157     if (rmax > smax)
158       smax = rmax;
159     for (i = cpu & l, j = 0; j < m; i += l) {
160       for (k = j + l; j < k; ++i, ++j) {
161         counts[i] = rbuf[j];
162       }
163     }
164     if (vme >= tcpus) {
165       if (vrmax > smax)
166         smax = vrmax;
167       for (i = vcpu & l, j = 0; j < m; i += l) {
168         for (k = j + l; j < k; ++i, ++j) {
169           vcounts[i] = vrbuf[j];
170         }
171       }
172     }
173 #if defined(DEBUG)
174     if (__fort_test & DEBUG_EXCH) {
175       printf("%d exch l=%d cpu=%d rmax=%d", lcpu, l, cpu, rmax);
176       for (i = 0; i < __fort_np2; ++i)
177         printf(" %d", counts[i]);
178       printf("\t>");
179       for (i = 0; i < m; ++i)
180         printf(" %d", sbuf[i]);
181       printf("\t<");
182       for (i = 0; i < m; ++i)
183         printf(" %d", rbuf[i]);
184       printf("\n");
185       if (vme >= tcpus) {
186         printf("%d exch l=%d cpu=%d rmax=%d", vme, l, vcpu, vrmax);
187         for (i = 0; i < __fort_np2; ++i)
188           printf(" %d", vcounts[i]);
189         printf("\t>");
190         for (i = 0; i < m; ++i)
191           printf(" %d", vsbuf[i]);
192         printf("\t<");
193         for (i = 0; i < m; ++i)
194           printf(" %d", vrbuf[i]);
195         printf("\n");
196       }
197     }
198 #endif
199   }
200   return smax;
201 }
202