1 /*
2 This file is part of GNU APL, a free implementation of the
3 ISO/IEC Standard 13751, "Programming Language APL, Extended"
4
5 Copyright (C) 2008-2015 Dr. Jürgen Sauermann
6
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include <assert.h>
22 #include <pthread.h>
23 #include <stdint.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <unistd.h>
28
29 #include <iostream>
30 #include <iomanip>
31
32 #include "../config.h"
33
34 using namespace std;
35
/// number of int64_t items processed per benchmark pass
/// (420e6 items * 8 bytes each = about 3.1 GiB of traffic per pass)
int64_t LEN = 420000000;
37
38 //-----------------------------------------------------------------------------
/// return the CPU time stamp counter (x86 RDTSC).
/// NOTE(review): RDTSC is not a serializing instruction and, on older CPUs,
/// counters may not be synchronized between cores — adequate for this coarse
/// benchmark, not for fine-grained timing. x86/x86_64 only.
inline uint64_t cycle_counter()
{
unsigned int lo, hi;

   // EDX:EAX <- 64-bit time stamp counter
   __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
   return ((uint64_t)hi << 32) | lo;
}
45 //-----------------------------------------------------------------------------
/// per-thread state for one benchmark worker
struct thread_context
{
   pthread_t thread;      ///< the worker thread (pthread_self() for the master)
   uint64_t  core_num;    ///< CPU core this worker is pinned to; 0 == master
   uint64_t  slice_len;   ///< number of int64_t items in this worker's slice
   int64_t * data;        ///< start of this worker's slice of the buffer
   uint64_t  t1;          ///< cycle counter when the work started
   uint64_t  t2;          ///< cycle counter when the work finished
};
55
/// start flag: the master thread sets it, subordinate threads busy-wait on it.
/// volatile so the compiler re-reads it in the spin loop instead of hoisting
/// the load (a plain bool here is a data race and the spin may never exit).
/// NOTE(review): std::atomic<bool> would be the fully correct modern fix.
volatile bool goon = false;
57
58 //-----------------------------------------------------------------------------
59 void *
thread_main_write(void * vp)60 thread_main_write(void * vp)
61 {
62 thread_context & ctx = *(thread_context *)vp;
63
64 if (ctx.core_num) // subordinate
65 {
66 while (!goon) ; // busy wait for goon
67 ctx.t1 = cycle_counter();
68 }
69 else // master
70 {
71 usleep(100000); // allow subordinate to start up
72 ctx.t1 = cycle_counter();
73 goon = true;
74 }
75
76 // do some work...
77 //
78 const int slice_len = ctx.slice_len;
79 int64_t * data = ctx.data;
80 const int64_t * end = data + slice_len;
81 while (data < end) *data++ = 42;
82
83 ctx.t2 = cycle_counter();
84 return 0;
85 }
86 //-----------------------------------------------------------------------------
87 void *
thread_main_read(void * vp)88 thread_main_read(void * vp)
89 {
90 thread_context & ctx = *(thread_context *)vp;
91
92 if (ctx.core_num) // subordinate
93 {
94 while (!goon) ; // busy wait for goon
95 ctx.t1 = cycle_counter();
96 }
97 else // master
98 {
99 usleep(100000); // allow subordinate to start up
100 ctx.t1 = cycle_counter();
101 goon = true;
102 }
103
104 // do some work...
105 //
106 const int slice_len = ctx.slice_len;
107 int64_t * data = ctx.data;
108 const int64_t * end = data + slice_len;
109 int64_t sum = 0;
110 while (data < end) sum += *data++;
111
112 ctx.t2 = cycle_counter();
113 return 0;
114 }
115 //-----------------------------------------------------------------------------
116 void
multi(int cores,int op,int64_t * data)117 multi(int cores, int op, int64_t * data)
118 {
119 thread_context ctx[cores];
120 int64_t slice_len = LEN / cores;
121 const char * opname = "write";
122 void * (*op_fun)(void *) = &thread_main_write;
123
124 if (op == 1)
125 {
126 opname = "read ";
127 op_fun = &thread_main_read;
128 }
129
130 for (int c = 0; c < cores; ++c)
131 {
132 ctx[c].core_num = c;
133 ctx[c].data = data + c*slice_len;
134 ctx[c].slice_len = slice_len;
135
136 ctx[c].thread = pthread_self();
137 if (c)
138 {
139 pthread_attr_t attr;
140 pthread_attr_init(&attr);
141 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
142 pthread_create(&ctx[c].thread, &attr, op_fun, ctx + c);
143 pthread_attr_destroy(&attr);
144 }
145
146 cpu_set_t cpus;
147 CPU_ZERO(&cpus);
148 CPU_SET(c, &cpus);
149 pthread_setaffinity_np(ctx[c].thread, sizeof(cpu_set_t), &cpus);
150 }
151
152 op_fun(ctx);
153 for (int c = 1; c < cores; ++c)
154 {
155 void * ret;
156 pthread_join(ctx[c].thread, &ret);
157 }
158
159 goon = false;
160
161 const uint64_t start = ctx[0].t1;
162
163 if (0)
164 for (int c = 0; c < cores; ++c)
165 {
166 cerr << "thread " << c << ": t1="
167 << setw(10) << (ctx[c].t1 - start) << " t2= "
168 << setw(10) << (ctx[c].t2 - start) << " duration= "
169 << setw(10) << (ctx[c].t2 - ctx[c].t1) << endl;
170 }
171
172 // summary
173 //
174 int max = 0;
175 for (int c = 0; c < cores; ++c)
176 {
177 const int diff = ctx[c].t2 - start;
178 if (max < diff) max = diff;
179 }
180
181 fprintf(stderr, "%s int64_t: %10d total, %5.2f cycles per %s"
182 " (on %d cores)\n",
183 opname, max, (cores*(double)max)/LEN, opname, cores);
184 }
185 //-----------------------------------------------------------------------------
186 void
sequential(int op,int64_t * data,bool verbose)187 sequential(int op, int64_t * data, bool verbose)
188 {
189 uint64_t t1;
190 uint64_t t2;
191 const char * opname;
192
193 if (op == 0)
194 {
195 opname = "write";
196 int64_t * d = data;
197 const int64_t * e = data + LEN;
198
199 t1 = cycle_counter();
200 while (d < e) *d++ = 42;
201 t2 = cycle_counter();
202 }
203 else if (op == 1)
204 {
205 opname = "read ";
206 int64_t sum = 1;
207 int64_t * d = data;
208
209 t1 = cycle_counter();
210 const int64_t * e = data + LEN;
211 while (d < e) sum += *d++;
212 t2 = cycle_counter();
213 }
214 else assert(0 && "Bad opnum");
215
216 if (!verbose) return;
217
218
219 fprintf(stderr,
220 "%s int64_t: %10d; total, %5.2f cycles per %s (sequential)\n",
221 opname, (int)(t2 - t1), ((double)(t2 - t1))/LEN, opname);
222 }
223 //-----------------------------------------------------------------------------
224 int
main(int argc,char * argv[])225 main(int argc, char *argv[])
226 {
227 cpu_set_t CPUs; CPU_ZERO(&CPUs);
228
229 const int err = pthread_getaffinity_np(pthread_self(), sizeof(CPUs), &CPUs);
230 assert(err == 0);
231 const int CPUs_present = CPU_COUNT(&CPUs);
232
233 cerr << "running memory benchmark on 1.." << CPUs_present
234 << " cores..." << endl;
235
236 int64_t * data = new int64_t[LEN];
237
238 // cache warm-up
239 //
240 sequential(0, data, false);
241 sequential(1, data, false);
242
243 for (int c = 0; c <= CPUs_present; ++c)
244 {
245 if (c == 0)
246 {
247 sequential(0, data, true);
248 sequential(1, data, true);
249 }
250 else
251 {
252 multi(c, 0, data);
253 multi(c, 1, data);
254 }
255 }
256 }
257 //-----------------------------------------------------------------------------
258