1 /*
2     This file is part of GNU APL, a free implementation of the
3     ISO/IEC Standard 13751, "Programming Language APL, Extended"
4 
5     Copyright (C) 2008-2015  Dr. Jürgen Sauermann
6 
7     This program is free software: you can redistribute it and/or modify
8     it under the terms of the GNU General Public License as published by
9     the Free Software Foundation, either version 3 of the License, or
10     (at your option) any later version.
11 
12     This program is distributed in the hope that it will be useful,
13     but WITHOUT ANY WARRANTY; without even the implied warranty of
14     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15     GNU General Public License for more details.
16 
17     You should have received a copy of the GNU General Public License
18     along with this program.  If not, see <http://www.gnu.org/licenses/>.
19 */
20 
21 #include <assert.h>
22 #include <pthread.h>
23 #include <stdint.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <unistd.h>
28 
29 #include <iostream>
30 #include <iomanip>
31 
32 #include "../config.h"
33 
34 using namespace std;
35 
// Number of int64_t items in the benchmark buffer. main() allocates an array
// of this many items, sequential() walks all of them, and multi() divides
// them evenly among the participating cores (LEN / cores items per thread).
// 420 million 8-byte items are about 3.36 GB of memory.
int64_t LEN = 420000000;
37 
38 //-----------------------------------------------------------------------------
// Return the current value of the CPU's time stamp counter, obtained with
// the x86 RDTSC instruction. RDTSC delivers the counter in two 32-bit halves
// (low half in EAX, high half in EDX) which are merged into one 64-bit value.
inline uint64_t cycle_counter()
{
unsigned int tsc_low;    // bits  0..31, from EAX
unsigned int tsc_high;   // bits 32..63, from EDX

   __asm__ __volatile__ ("rdtsc" : "=a" (tsc_low), "=d" (tsc_high));

const uint64_t upper_half = (uint64_t)tsc_high << 32;
   return upper_half | tsc_low;
}
45 //-----------------------------------------------------------------------------
// Per-thread parameters and results of one multi-core benchmark run.
// multi() fills in thread, core_num, slice_len and data before the thread
// function runs; the thread function stores the cycle counter into t1 just
// before it starts working and into t2 right after it has finished.
struct thread_context
{
  pthread_t  thread;      // thread handle (pthread_self() for core 0)
  uint64_t   core_num;    // index of the thread; 0 is the master
  uint64_t   slice_len;   // number of int64_t items this thread processes
  int64_t  * data;        // first item of this thread's slice
  uint64_t   t1;          // cycle counter at start of work
  uint64_t   t2;          // cycle counter at end of work
};
55 
// Start flag of a multi-core run: the master (core_num 0) sets it to true
// after its start-up delay, the subordinate threads poll it before they
// begin working, and multi() resets it to false once all threads are joined.
bool goon = false;
57 
58 //-----------------------------------------------------------------------------
59 void *
thread_main_write(void * vp)60 thread_main_write(void * vp)
61 {
62 thread_context & ctx = *(thread_context *)vp;
63 
64    if (ctx.core_num)   // subordinate
65       {
66         while (!goon)   ;   // busy wait for goon
67         ctx.t1 = cycle_counter();
68       }
69    else                // master
70       {
71         usleep(100000);   // allow subordinate to start up
72         ctx.t1 = cycle_counter();
73         goon = true;
74       }
75 
76    // do some work...
77    //
78 const int slice_len = ctx.slice_len;
79 int64_t * data = ctx.data;
80 const int64_t * end = data + slice_len;
81    while (data < end)   *data++ = 42;
82 
83    ctx.t2 = cycle_counter();
84    return 0;
85 }
86 //-----------------------------------------------------------------------------
87 void *
thread_main_read(void * vp)88 thread_main_read(void * vp)
89 {
90 thread_context & ctx = *(thread_context *)vp;
91 
92    if (ctx.core_num)   // subordinate
93       {
94         while (!goon)   ;   // busy wait for goon
95         ctx.t1 = cycle_counter();
96       }
97    else                // master
98       {
99         usleep(100000);   // allow subordinate to start up
100         ctx.t1 = cycle_counter();
101         goon = true;
102       }
103 
104    // do some work...
105    //
106 const int slice_len = ctx.slice_len;
107 int64_t * data = ctx.data;
108 const int64_t * end = data + slice_len;
109 int64_t sum = 0;
110    while (data < end)   sum += *data++;
111 
112    ctx.t2 = cycle_counter();
113    return 0;
114 }
115 //-----------------------------------------------------------------------------
116 void
multi(int cores,int op,int64_t * data)117 multi(int cores, int op, int64_t * data)
118 {
119 thread_context ctx[cores];
120 int64_t slice_len = LEN / cores;
121 const char * opname = "write";
122 void * (*op_fun)(void *) = &thread_main_write;
123 
124    if (op == 1)
125       {
126         opname = "read ";
127         op_fun = &thread_main_read;
128       }
129 
130    for (int c = 0; c < cores; ++c)
131       {
132         ctx[c].core_num = c;
133         ctx[c].data = data + c*slice_len;
134         ctx[c].slice_len = slice_len;
135 
136         ctx[c].thread = pthread_self();
137         if (c)
138            {
139              pthread_attr_t attr;
140              pthread_attr_init(&attr);
141              pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
142              pthread_create(&ctx[c].thread, &attr, op_fun,  ctx + c);
143              pthread_attr_destroy(&attr);
144            }
145 
146         cpu_set_t cpus;
147         CPU_ZERO(&cpus);
148         CPU_SET(c, &cpus);
149         pthread_setaffinity_np(ctx[c].thread, sizeof(cpu_set_t), &cpus);
150       }
151 
152    op_fun(ctx);
153    for (int c = 1; c < cores; ++c)
154       {
155          void * ret;
156         pthread_join(ctx[c].thread, &ret);
157       }
158 
159    goon = false;
160 
161 const uint64_t start = ctx[0].t1;
162 
163    if (0)
164    for (int c = 0; c < cores; ++c)
165        {
166          cerr << "thread " << c << ": t1="
167               << setw(10) << (ctx[c].t1 - start) << " t2= "
168               << setw(10) << (ctx[c].t2 - start) << " duration= "
169               << setw(10) << (ctx[c].t2 - ctx[c].t1) << endl;
170        }
171 
172    // summary
173    //
174 int max = 0;
175    for (int c = 0; c < cores; ++c)
176        {
177          const int diff = ctx[c].t2 - start;
178          if (max < diff)   max = diff;
179        }
180 
181    fprintf(stderr, "%s int64_t: %10d total, %5.2f cycles per %s"
182                    " (on %d cores)\n",
183            opname, max, (cores*(double)max)/LEN, opname, cores);
184 }
185 //-----------------------------------------------------------------------------
186 void
sequential(int op,int64_t * data,bool verbose)187 sequential(int op, int64_t * data, bool verbose)
188 {
189 uint64_t t1;
190 uint64_t t2;
191 const char * opname;
192 
193    if (op == 0)
194       {
195         opname = "write";
196         int64_t * d = data;
197         const int64_t * e = data + LEN;
198 
199         t1 = cycle_counter();
200         while (d < e)   *d++ = 42;
201         t2 = cycle_counter();
202       }
203    else if (op == 1)
204       {
205         opname = "read ";
206         int64_t sum = 1;
207         int64_t * d = data;
208 
209         t1 = cycle_counter();
210         const int64_t * e = data + LEN;
211         while (d < e)   sum += *d++;
212         t2 = cycle_counter();
213       }
214    else assert(0 && "Bad opnum");
215 
216    if (!verbose)   return;
217 
218 
219    fprintf(stderr,
220            "%s int64_t: %10d; total, %5.2f cycles per %s (sequential)\n",
221            opname, (int)(t2 - t1), ((double)(t2 - t1))/LEN, opname);
222 }
223 //-----------------------------------------------------------------------------
224 int
main(int argc,char * argv[])225 main(int argc, char *argv[])
226 {
227 cpu_set_t CPUs;   CPU_ZERO(&CPUs);
228 
229 const int err = pthread_getaffinity_np(pthread_self(), sizeof(CPUs), &CPUs);
230    assert(err == 0);
231 const int CPUs_present = CPU_COUNT(&CPUs);
232 
233    cerr << "running memory benchmark on 1.." << CPUs_present
234         << " cores..." << endl;
235 
236 int64_t * data = new int64_t[LEN];
237 
238    // cache warm-up
239    //
240    sequential(0, data, false);
241    sequential(1, data, false);
242 
243    for (int c = 0; c <= CPUs_present; ++c)
244       {
245         if (c == 0)
246            {
247              sequential(0, data, true);
248              sequential(1, data, true);
249            }
250         else
251            {
252              multi(c, 0, data);
253              multi(c, 1, data);
254            }
255       }
256 }
257 //-----------------------------------------------------------------------------
258