1 /*
2 * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *
16 */
17
18 /* exch.c -- total exchange routine */
19
20 #include "stdioInterf.h"
21 #include "fioMacros.h"
22
23 #include "fort_vars.h"
24
25 static int _1 = 1;
26 static chdr *exch[16] = {NULL};
27 static int *rbuf, *sbuf, *vcounts, *vrbuf, *vsbuf;
28 static int rmax, smax, vrmax; /* * * * NOT THREAD SAFE * * * */
29
30 /* counts is an integer array of length 2**ceil(log2 np). For each
31 pair of processors {i,j}, counts[j] on processor i is exchanged
32 with counts[i] on processor j, and the maximum count value is
33 returned. Ceil(log2 np) communication steps are required. If the
34 number of processors is not a power of two, some processors do
35 double duty as "virtual processors". */
36
37 int
__fort_exchange_counts(int * counts)38 __fort_exchange_counts(int *counts)
39 {
40 chdr *c;
41 int cpu, lcpu, i, j, k, l, m, n, tcpus, vcpu, vme;
42 int *tcpus_addr;
43
44 lcpu = GET_DIST_LCPU;
45 tcpus = GET_DIST_TCPUS;
46 tcpus_addr = GET_DIST_TCPUS_ADDR;
47 smax = counts[0]; /* initial maximum */
48 if (tcpus == 1)
49 return smax;
50
51 for (i = tcpus; --i > 0;) {
52 if (counts[i] > smax)
53 smax = counts[i];
54 }
55
56 #if defined(DEBUG)
57 if (__fort_test & DEBUG_EXCH) {
58 for (i = tcpus; i < __fort_np2; ++i)
59 counts[i] = -lcpu;
60 printf("%d exch counts", lcpu);
61 for (i = 0; i < tcpus; ++i)
62 printf(" %d", counts[i]);
63 printf("\n");
64 }
65 #endif
66
67 m = __fort_np2 >> 1; /* message length, also most
68 significant bit of processor number */
69 vme = lcpu ^ m; /* virtual processor number */
70
71 /* one-time setup... */
72
73 if (exch[0] == NULL) {
74
75 /* allocate buffers */
76
77 n = __fort_np2;
78 if (__fort_np2 != tcpus)
79 n *= 3; /* more buffer needed for virtual processor */
80 rbuf = (int *)__fort_gmalloc(n * sizeof(int));
81 sbuf = rbuf + m;
82 if (__fort_np2 != tcpus) {
83 vrbuf = sbuf + m;
84 vsbuf = vrbuf + m;
85 vcounts = vsbuf + m;
86 } else
87 vsbuf = vrbuf = vcounts = NULL;
88
89 /* initialize channels */
90
91 for (n = 0, l = 1; l < tcpus; ++n, l <<= 1) {
92 c = __fort_chn_1to1(NULL, 1, 0, tcpus_addr, &_1, 1, 0, tcpus_addr, &_1);
93 cpu = lcpu ^ l;
94 if (cpu >= tcpus)
95 cpu ^= m;
96 vcpu = vme ^ l;
97 if (vcpu >= tcpus)
98 vcpu ^= m;
99 #if defined(DEBUG)
100 if (__fort_test & DEBUG_EXCH) {
101 printf("%d exch l=%d cpu=%d(%d)\n", lcpu, l, lcpu ^ l, cpu);
102 if (vme >= tcpus) {
103 printf("%d exch l=%d cpu=%d(%d)\n", vme, l, vme ^ l, vcpu);
104 }
105 }
106 #endif
107 __fort_sendl(c, cpu, &smax, 1, 1, __CINT, sizeof(int));
108 __fort_sendl(c, cpu, sbuf, m, 1, __CINT, sizeof(int));
109 if (vme >= tcpus) {
110 __fort_sendl(c, vcpu, &smax, 1, 1, __CINT, sizeof(int));
111 __fort_sendl(c, vcpu, vsbuf, m, 1, __CINT, sizeof(int));
112 if (cpu == lcpu) {
113 __fort_recvl(c, vcpu, &vrmax, 1, 1, __CINT, sizeof(int));
114 __fort_recvl(c, vcpu, vrbuf, m, 1, __CINT, sizeof(int));
115 __fort_recvl(c, cpu, &rmax, 1, 1, __CINT, sizeof(int));
116 __fort_recvl(c, cpu, rbuf, m, 1, __CINT, sizeof(int));
117 } else {
118 __fort_recvl(c, cpu, &rmax, 1, 1, __CINT, sizeof(int));
119 __fort_recvl(c, cpu, rbuf, m, 1, __CINT, sizeof(int));
120 __fort_recvl(c, vcpu, &vrmax, 1, 1, __CINT, sizeof(int));
121 __fort_recvl(c, vcpu, vrbuf, m, 1, __CINT, sizeof(int));
122 }
123 } else {
124 __fort_recvl(c, cpu, &rmax, 1, 1, __CINT, sizeof(int));
125 __fort_recvl(c, cpu, rbuf, m, 1, __CINT, sizeof(int));
126 }
127 __fort_chn_prune(c);
128 exch[n] = c;
129 }
130 }
131
132 #if defined(DEBUG)
133 if (vme >= tcpus) {
134 for (i = 0; i < __fort_np2; ++i)
135 vcounts[i] = -vme;
136 }
137 #endif
138
139 /* do the exchanges */
140
141 for (n = 0, l = 1; l < tcpus; ++n, l <<= 1) {
142 cpu = lcpu ^ l;
143 vcpu = vme ^ l;
144 for (i = cpu & l, j = 0; j < m; i += l) {
145 for (k = j + l; j < k; ++i, ++j) {
146 sbuf[j] = counts[i];
147 }
148 }
149 if (vme >= tcpus) {
150 for (i = vcpu & l, j = 0; j < m; i += l) {
151 for (k = j + l; j < k; ++i, ++j) {
152 vsbuf[j] = vcounts[i];
153 }
154 }
155 }
156 __fort_doit(exch[n]);
157 if (rmax > smax)
158 smax = rmax;
159 for (i = cpu & l, j = 0; j < m; i += l) {
160 for (k = j + l; j < k; ++i, ++j) {
161 counts[i] = rbuf[j];
162 }
163 }
164 if (vme >= tcpus) {
165 if (vrmax > smax)
166 smax = vrmax;
167 for (i = vcpu & l, j = 0; j < m; i += l) {
168 for (k = j + l; j < k; ++i, ++j) {
169 vcounts[i] = vrbuf[j];
170 }
171 }
172 }
173 #if defined(DEBUG)
174 if (__fort_test & DEBUG_EXCH) {
175 printf("%d exch l=%d cpu=%d rmax=%d", lcpu, l, cpu, rmax);
176 for (i = 0; i < __fort_np2; ++i)
177 printf(" %d", counts[i]);
178 printf("\t>");
179 for (i = 0; i < m; ++i)
180 printf(" %d", sbuf[i]);
181 printf("\t<");
182 for (i = 0; i < m; ++i)
183 printf(" %d", rbuf[i]);
184 printf("\n");
185 if (vme >= tcpus) {
186 printf("%d exch l=%d cpu=%d rmax=%d", vme, l, vcpu, vrmax);
187 for (i = 0; i < __fort_np2; ++i)
188 printf(" %d", vcounts[i]);
189 printf("\t>");
190 for (i = 0; i < m; ++i)
191 printf(" %d", vsbuf[i]);
192 printf("\t<");
193 for (i = 0; i < m; ++i)
194 printf(" %d", vrbuf[i]);
195 printf("\n");
196 }
197 }
198 #endif
199 }
200 return smax;
201 }
202