1 /* MAT.cpp
2 *
3 * Copyright (C) 2017-2021 Paul Boersma
4 *
5 * This code is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or (at
8 * your option) any later version.
9 *
10 * This code is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13 * See the GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this work. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 #include "melder.h"
20 #include "../dwsys/NUM2.h"
21 //#include "../external/gsl/gsl_blas.h"
22
23 #ifdef macintosh
24 #include <Accelerate/Accelerate.h>
25 #undef trace
26 #import <MetalPerformanceShaders/MetalPerformanceShaders.h>
27 #include <OpenCL/opencl.h>
28 #endif
29
centreEachColumn_MAT_inout(MATVU const & x)30 void centreEachColumn_MAT_inout (MATVU const& x) noexcept {
31 for (integer icol = 1; icol <= x.ncol; icol ++) {
32 const double columnMean = NUMmean (x.column (icol));
33 for (integer irow = 1; irow <= x.nrow; irow ++)
34 x [irow] [icol] -= columnMean;
35 }
36 }
37
centreEachRow_MAT_inout(MATVU const & x)38 void centreEachRow_MAT_inout (MATVU const& x) noexcept {
39 for (integer irow = 1; irow <= x.nrow; irow ++)
40 centre_VEC_inout (x [irow]);
41 }
42
void doubleCentre_MAT_inout (MATVU const& x) noexcept {
	/*
		Double-centre x in place: first remove each row's mean, then each
		column's mean. After this, both the row means and the column means
		are (up to rounding) zero.
	*/
	centreEachRow_MAT_inout (x);
	centreEachColumn_MAT_inout (x);
}
47
void mtm_MAT_out (MATVU const& target, constMATVU const& x) noexcept {
	/*
		target := x' . x,
		i.e. the symmetric matrix of inner products of the columns of x.
		`target` has to be x.ncol × x.ncol.
	*/
	Melder_assert (target.nrow == x.ncol);
	Melder_assert (target.ncol == x.ncol);
	#if 0
	/*
		Variant 1 (disabled): straight triple loop with a long-double
		accumulator; computes only the upper triangle and mirrors each
		element into the lower triangle on the fly.
	*/
	for (integer irow = 1; irow <= target.nrow; irow ++) {
		for (integer icol = irow; icol <= target.ncol; icol ++) {
			longdouble t = 0.0;
			for (integer k = 1; k <= x.nrow; k ++)
				t += x [k] [irow] * x [k] [icol];
			target [irow] [icol] = target [icol] [irow] = double (t);
		}
	}
	#elif 0
	/*
		Variant 2 (disabled): pairwise summation in long double,
		stepping both column pointers by x.rowStride.
	*/
	for (integer irow = 1; irow <= target.nrow; irow ++) {
		for (integer icol = irow; icol <= target.ncol; icol ++) {
			PAIRWISE_SUM (longdouble, sum, integer, x.nrow,
				const double *px1 = & x [1] [irow];
				const double *px2 = & x [1] [icol],
				longdouble (*px1) * longdouble (*px2),
				(px1 += x.rowStride, px2 += x.rowStride)
			)
			target [irow] [icol] = target [icol] [irow] = double (sum);
		}
	}
	#else
	/*
		Active variant: zero the upper triangle, then accumulate with k as
		the outer loop (so that x is traversed row by row, in memory order),
		and finally mirror the upper triangle into the lower triangle.
		Accumulation is in plain double here, unlike the disabled variants.
	*/
	for (integer irow = 1; irow <= target.nrow; irow ++)
		for (integer icol = irow; icol <= target.ncol; icol ++)
			target [irow] [icol] = 0.0;
	for (integer k = 1; k <= x.nrow; k ++)
		for (integer irow = 1; irow <= target.nrow; irow ++)
			for (integer icol = irow; icol <= target.ncol; icol ++)
				target [irow] [icol] += x [k] [irow] * x [k] [icol];
	for (integer irow = 2; irow <= target.nrow; irow ++)
		for (integer icol = 1; icol < irow; icol ++)
			target [irow] [icol] = target [icol] [irow];
	#endif
}
85
void _mul_MAT_out (MATVU const& target, constMATVU const& x, constMATVU const& y) noexcept {
	/*
		Precise matrix multiplication, using pairwise summation.

		target := x . y, without any allocation. The four branches differ only
		in the pointer steps fed to PAIRWISE_SUM: a stride of 1 lets the inner
		product walk contiguous memory, which is what makes a branch fast.
	*/
	if (x.colStride == 1) {
		if (y.rowStride == 1) {
			/*
				Appropriate for Target := X.Y',
				if X and Y are packed row-major matrices.
				The speed is 0.142, 0.716, 1.32, 1.64, 2.33, 2.22, 2.08, 2.31, 2.18, 1.89, 1.77, 1.53 Gflop/s
				for size = 1, 3, 10, 20, 50, 100, 200, 500, 1000, 2000, 3000, 5000.
			*/
			for (integer irow = 1; irow <= target.nrow; irow ++) {
				for (integer icol = 1; icol <= target.ncol; icol ++) {
					PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
						const double *px = & x [irow] [1];
						const double *py = & y [1] [icol],
						longdouble (*px) * longdouble (*py),
						(px += 1, py += 1)
					)
					target [irow] [icol] = double (sum);
				}
			}
		} else {
			/*
				Appropriate for Target := X.Y,
				if X and Y are packed row-major matrices.
				The speed is 0.143, 0.684, 1.20, 1.64, 2.24, 2.04, 1.44, 1.22, 0.56, 0.114 Gflop/s
				for size = 1, 3, 10, 20, 50, 100, 200, 500, 1000, 2000.
			*/
			for (integer irow = 1; irow <= target.nrow; irow ++) {
				for (integer icol = 1; icol <= target.ncol; icol ++) {
					PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
						const double *px = & x [irow] [1];
						const double *py = & y [1] [icol],
						longdouble (*px) * longdouble (*py),
						(px += 1, py += y.rowStride)
					)
					target [irow] [icol] = double (sum);
				}
			}
		}
	} else if (y.rowStride == 1) {
		/*
			Appropriate for Target := X'.Y',
			if X and Y are packed row-major matrices.
			The speed is 0.136, 0.666, 1.22, 1.65, 2.36, 1.96, 1.62, 1.24, 0.69, 0.118 Gflop/s
			for size = 1, 3, 10, 20, 50, 100, 200, 500, 1000, 2000.
		*/
		for (integer irow = 1; irow <= target.nrow; irow ++) {
			for (integer icol = 1; icol <= target.ncol; icol ++) {
				PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
					const double *px = & x [irow] [1];
					const double *py = & y [1] [icol],
					longdouble (*px) * longdouble (*py),
					(px += x.colStride, py += 1)
				)
				target [irow] [icol] = double (sum);
			}
		}
	} else {
		/*
			Appropriate for Target := X'.Y,
			if X and Y are packed row-major matrices.
			The speed is 0.143, 0.572, 1.10, 1.43, 1.71, 1.70, 1.29, 0.71, 0.067 Gflop/s
			for size = 1, 3, 10, 20, 50, 100, 200, 500, 1000.
		*/
		for (integer irow = 1; irow <= target.nrow; irow ++) {
			for (integer icol = 1; icol <= target.ncol; icol ++) {
				PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
					const double *px = & x [irow] [1];
					const double *py = & y [1] [icol],
					longdouble (*px) * longdouble (*py),
					(px += x.colStride, py += y.rowStride)
				)
				target [irow] [icol] = double (sum);
			}
		}
	}
}
166
void _mul_forceAllocation_MAT_out (MATVU const& target, constMATVU x, constMATVU y) {
	/*
		target := x . y, but first copy/transpose the operands into freshly
		allocated packed matrices whenever that gives the inner product a
		contiguous memory walk. Note that x and y are taken *by value*
		(plain constMATVU, not a reference) precisely so that they can be
		rebound to views over the temporary copies below.

		As seen above, the only multiplication that stays fast for large sizes,
		if X and Y are packed row-major matrices, is X.Y';
		this is because both x.colStride and y.rowStride are 1 in that case.
		It may therefore be useful to convert any matrix X that has
		a column stride unequal to 1, and any matrix Y that has a row stride
		unequal to 1, to matrices that do have these desirable properties.

		For the X.Y case, where X and Y are packed row-major matrices,
		the speed is 0.084, 0.124, 0.91, 1.56, 2.26, 2.18, 2.12, 2.25, 2.23, 1.85, 1.78, 1.57 Gflop/s
		for size = 1, 3, 10, 20, 50, 100, 200, 500, 1000, 2000, 3000, 5000.

		For the X.Y' case, where X and Y are packed row-major matrices, there is no allocation, and
		the speed is 0.084, 0.610, 1.26, 1.69, 2.32, 2.20, 2.12, 2.28, 2.24, 1.91, 1.76, 1.53 Gflop/s
		for size = 1, 3, 10, 20, 50, 100, 200, 500, 1000, 2000, 3000, 5000.

		For the X'.Y case, where X and Y are packed row-major matrices,
		the speed is 0.082, 0.068, 0.73, 1.42, 2.20, 2.14, 2.09, 2.27, 2.21, 1.84, 1.77, 1.53 Gflop/s
		for size = 1, 3, 10, 20, 50, 100, 200, 500, 1000, 2000, 3000, 5000.

		For the X'.Y' case, where X and Y are packed row-major matrices,
		the speed is 0.082, 0.117, 0.90, 1.57, 2.25, 2.19, 2.12, 2.23, 2.09, 1.92, 1.69, 1.48 Gflop/s
		for size = 1, 3, 10, 20, 50, 100, 200, 500, 1000, 2000, 3000, 5000.
	*/
	autoMAT tmpX, tmpY;   // the scope shall extend to the end of the function
	if (x.colStride != 1) {
		tmpX = copy_MAT (x);   // a packed copy, so that rows are contiguous
		x = tmpX.all();
		Melder_assert (x.colStride == 1);
	}
	if (y.rowStride != 1) {
		tmpY = transpose_MAT (y);   // a packed transpose, viewed back as y, makes columns contiguous
		y = tmpY.transpose();
		Melder_assert (y.rowStride == 1);
	}
	for (integer irow = 1; irow <= target.nrow; irow ++) {
		for (integer icol = 1; icol <= target.ncol; icol ++) {
			PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
				const double *px = & x [irow] [1];
				const double *py = & y [1] [icol],
				longdouble (*px) * longdouble (*py),
				(px += 1, py += 1)
			)
			target [irow] [icol] = double (sum);
		}
	}
}
215
void _mul_allowAllocation_MAT_out (MATVU const& target, constMATVU x, constMATVU y) {
	/*
		The faster of _mul_MAT_out and _mul_forceAllocation_MAT_out.
		Allocation takes place only for larger matrices, e.g. from size 47 on
		(100,000 flops): the work estimate target.nrow * target.ncol * x.ncol
		has to exceed 1e5 before copying an operand pays off.
		As in _mul_forceAllocation_MAT_out, x and y are taken by value so
		that they can be rebound to views over the temporary copies.

		For the X.Y case, where X and Y are packed row-major matrices,
		the speed is 0.204, 1.130, 3.74, 3.47, 4.77, 3.81, 4.05, 4.26, 3.92, 2.82, 2.83, 2.80 Gflop/s on 2.9 GHz i9
		the speed is 0.158, 1.452, 3.70, 4.40, 6.01, 4.55, 4.56, 4.76, 4.84, 3.23, 3.23, 3.26 Gflop/s with SSE 4.2
		the speed is 0.135, 1.483, 3.85, 4.80, 5.60, 4.49, 4.43, 4.52, 5.05, 3.31, 3.19, 3.25 Gflop/s with AVX2
		the speed is 0.087, 0.574, 1.18, 1.61, 2.25, 2.14, 2.11, 2.23, 2.23, 1.88, 1.74, 1.53 Gflop/s
		for size = 1, 3, 10, 20, 50, 100, 200, 500, 1000, 2000, 3000, 5000.

		For the X.Y' case, where X and Y are packed row-major matrices, there is never allocation, and
		the speed is 0.088, 0.577, 1.28, 1.67, 2.27, 2.18, 2.12, 2.28, 2.20, 1.96, 1.78, 1.57 Gflop/s
		for size = 1, 3, 10, 20, 50, 100, 200, 500, 1000, 2000, 3000, 5000.

		For the X'.Y case, where X and Y are packed row-major matrices,
		the speed is 0.084, 0.547, 1.12, 1.44, 2.20, 2.15, 2.04, 2.25, 2.18, 1.92, 1.74, 1.48 Gflop/s
		for size = 1, 3, 10, 20, 50, 100, 200, 500, 1000, 2000, 3000, 5000.

		For the X'.Y' case, where X and Y are packed row-major matrices,
		the speed is 0.084, 0.553, 1.18, 1.63, 2.31, 2.12, 2.12, 2.25, 2.16, 1.90, 1.79, 1.50 Gflop/s
		for size = 1, 3, 10, 20, 50, 100, 200, 500, 1000, 2000, 3000, 5000.
	*/
	if (x.colStride == 1) {
		if (y.rowStride == 1) {
			/*
				Both operands already have contiguous inner strides:
				never any need to allocate.
			*/
			for (integer irow = 1; irow <= target.nrow; irow ++) {
				for (integer icol = 1; icol <= target.ncol; icol ++) {
					PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
						const double *px = & x [irow] [1];
						const double *py = & y [1] [icol],
						longdouble (*px) * longdouble (*py),
						(px += 1, py += 1)
					)
					target [irow] [icol] = double (sum);
				}
			}
		} else {
			if (double (target.nrow) * double (target.ncol) * double (x.ncol) > 1e5) {
				/*
					Enough work to justify transposing y into a packed copy.
				*/
				autoMAT tmpY = transpose_MAT (y);
				y = tmpY.transpose();
				Melder_assert (y.rowStride == 1);
				for (integer irow = 1; irow <= target.nrow; irow ++) {
					for (integer icol = 1; icol <= target.ncol; icol ++) {
						PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
							const double *px = & x [irow] [1];
							const double *py = & y [1] [icol],
							longdouble (*px) * longdouble (*py),
							(px += 1, py += 1)
						)
						target [irow] [icol] = double (sum);
					}
				}
			} else {
				/*
					Small problem: just stride through y.
				*/
				for (integer irow = 1; irow <= target.nrow; irow ++) {
					for (integer icol = 1; icol <= target.ncol; icol ++) {
						PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
							const double *px = & x [irow] [1];
							const double *py = & y [1] [icol],
							longdouble (*px) * longdouble (*py),
							(px += 1, py += y.rowStride)
						)
						target [irow] [icol] = double (sum);
					}
				}
			}
		}
	} else if (y.rowStride == 1) {
		if (double (target.nrow) * double (target.ncol) * double (x.ncol) > 1e5) {
			/*
				Enough work to justify copying x into a packed matrix.
			*/
			autoMAT tmpX = copy_MAT (x);
			x = tmpX.all();
			Melder_assert (x.colStride == 1);
			for (integer irow = 1; irow <= target.nrow; irow ++) {
				for (integer icol = 1; icol <= target.ncol; icol ++) {
					PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
						const double *px = & x [irow] [1];
						const double *py = & y [1] [icol],
						longdouble (*px) * longdouble (*py),
						(px += 1, py += 1)
					)
					target [irow] [icol] = double (sum);
				}
			}
		} else {
			for (integer irow = 1; irow <= target.nrow; irow ++) {
				for (integer icol = 1; icol <= target.ncol; icol ++) {
					PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
						const double *px = & x [irow] [1];
						const double *py = & y [1] [icol],
						longdouble (*px) * longdouble (*py),
						(px += x.colStride, py += 1)
					)
					target [irow] [icol] = double (sum);
				}
			}
		}
	} else {
		if (double (target.nrow) * double (target.ncol) * double (x.ncol) > 1e5) {
			/*
				Enough work to justify repacking both operands.
			*/
			autoMAT tmpX = copy_MAT (x);
			x = tmpX.all();
			Melder_assert (x.colStride == 1);
			autoMAT tmpY = transpose_MAT (y);
			y = tmpY.transpose();
			Melder_assert (y.rowStride == 1);
			for (integer irow = 1; irow <= target.nrow; irow ++) {
				for (integer icol = 1; icol <= target.ncol; icol ++) {
					PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
						const double *px = & x [irow] [1];
						const double *py = & y [1] [icol],
						longdouble (*px) * longdouble (*py),
						(px += 1, py += 1)
					)
					target [irow] [icol] = double (sum);
				}
			}
		} else {
			for (integer irow = 1; irow <= target.nrow; irow ++) {
				for (integer icol = 1; icol <= target.ncol; icol ++) {
					PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
						const double *px = & x [irow] [1];
						const double *py = & y [1] [icol],
						longdouble (*px) * longdouble (*py),
						(px += x.colStride, py += y.rowStride)
					)
					target [irow] [icol] = double (sum);
				}
			}
		}
	}
}
347
MATmul_rough_naiveReferenceImplementation(MATVU const & target,constMATVU const & x,constMATVU const & y)348 static inline void MATmul_rough_naiveReferenceImplementation (MATVU const& target, constMATVU const& x, constMATVU const& y) noexcept {
349 /*
350 If x.colStride == size and y.colStride == 1,
351 this version is 0.073, 1.32, 1.17, 0.58 Gflop/s for size = 1,10,100,1000.
352 */
353 for (integer irow = 1; irow <= target.nrow; irow ++) {
354 for (integer icol = 1; icol <= target.ncol; icol ++) {
355 target [irow] [icol] = 0.0;
356 for (integer i = 1; i <= x.ncol; i ++)
357 target [irow] [icol] += x [irow] [i] * y [i] [icol];
358 }
359 }
360 }
void _mul_fast_MAT_out (MATVU const& target, constMATVU const& x, constMATVU const& y) noexcept {
	/*
		target := x . y, optimized for speed rather than precision:
		accumulation is in plain double, and the loop order is chosen so
		that the innermost loop runs along contiguous memory where possible.
	*/
	if ((false)) {
		MATmul_rough_naiveReferenceImplementation (target, x, y);
	} else if (y.colStride == 1) {
		/*
			This case is appropriate for the multiplication of full matrices
				X.Y
			or
				X'.Y

			The speed for X.Y is 0.053, 1.37, 3.14, 2.99, 2.38, 2.06, 1.70 Gflop/s
			for size = 1, 10, 100, 1000, 2000, 3000, 5000.
			The speed for X'.Y is 0.063, 1.37, 3.11, 2.72 Gflop/s for size = 1,10,100,1000.

			The trick is to have the inner loop run along two fastest indices;
			for target as well as y, this fastest index is the last index.
			Note that the multiplication factor within the inner loop is constant,
			so we move it out of the loop (by hand, in case the compiler doesn't do it).
		*/
		#if 1
		for (integer irow = 1; irow <= target.nrow; irow ++) {
			VECVU const targetrow = target [irow];
			for (integer icol = 1; icol <= target.ncol; icol ++)
				targetrow [icol] = 0.0;
			for (integer i = 1; i <= x.ncol; i ++) {
				const double xcell = x [irow] [i];
				constVECVU const yrow = y [i];
				for (integer icol = 1; icol <= target.ncol; icol ++)
					targetrow [icol] += xcell * yrow [icol];
			}
		}
		#elif 0
		/*
			Using 64-bit BLAS from Apple's Accelerate framework.
			The speed for X.Y is 0.037, 1.51, 3.40, 2.40, 1.82, 1.64, 1.04 Gflop/s
			for size = 1, 10, 100, 1000, 2000, 3000, 5000.
			This is not really faster than our own simple implementation
			(perhaps 32-bit BLAS is faster, because it can use more flops per cycle).
		*/
		cblas_dgemm (CblasRowMajor, CblasNoTrans, CblasNoTrans,
			target.nrow, target.ncol, x.ncol,
			1.0,
			& x [1] [1], x.rowStride,
			& y [1] [1], y.rowStride,
			0.0,
			& target [1] [1], target.rowStride
		);
		#else
		/*
			An implementation that is notationally ideal.
			Does the compiler manage to move the constant parts
			of the expression outside the loop?

			The speed for X.Y is 0.056, 1.08, 2.99, 2.87 Gflop/s for size = 1,10,100,1000.
		*/
		for (integer irow = 1; irow <= target.nrow; irow ++) {
			for (integer icol = 1; icol <= target.ncol; icol ++)
				target [irow] [icol] = 0.0;
			for (integer i = 1; i <= x.ncol; i ++)
				for (integer icol = 1; icol <= target.ncol; icol ++)
					target [irow] [icol] += x [irow] [i] * y [i] [icol];
		}
		#endif
	} else if (y.rowStride == 1) {
		if (x.colStride == 1) {
			/*
				This case will be appropriate for the multiplication of full matrices
					X.Y'
				The speed is 0.064, 1.18, 1.67, 1.69 Gflop/s for size = 1,10,100,1000.
			*/
			_mul_MAT_out (target, x, y);
		} else {
			/*
				This case will be appropriate for the multiplication of full matrices
					X'.Y'

				So we will make this fast by making the target matrix column-major.
				The speed will be 0.065, 1.27, 1.45, 1.21 Gflop/s for size = 1,10,100,1000.
				However, this will work only once an automatrix has row and column strides;
				until that time we cannot really modify the structure of `target`.

				For the moment, the target has to stay row-major.
				The speed is 0.064, 1.21, 1.41, 0.43 Gflop/s for size = 1,10,100,1000.

				The trick is to have the inner loop run along two fastest indices;
				for both target (in future) and y, this fastest index is the first index.
			*/
			//target.rowStride = 1;
			//target.colStride = target.nrow;
			for (integer icol = 1; icol <= target.ncol; icol ++) {
				VECVU const targetcolumn = target.column (icol);
				for (integer irow = 1; irow <= target.nrow; irow ++)
					targetcolumn [irow] = 0.0;
				for (integer i = 1; i <= x.ncol; i ++) {
					constVECVU const ycolumn = y.column (i);
					for (integer irow = 1; irow <= target.nrow; irow ++)
						targetcolumn [irow] += x [irow] [i] * ycolumn [irow];
				}
			}
		}
	} else {
		/*
			A rare case: the strides of y are both greater than 1.
			We do not bother to optimize these cases yet.
		*/
		MATmul_rough_naiveReferenceImplementation (target, x, y);
	}
}
469
void MATmul_forceMetal_ (MATVU const& target, constMATVU const& x, constMATVU const& y) {
	/*
		target := x . y, computed on the GPU via Metal Performance Shaders.
		The data are converted to 32-bit floats for the GPU, so the result
		is less precise than the CPU (double-precision) paths above.
		NOTE(review): on macOS older than 10.13 the @available branch is
		skipped and `target` is left untouched — confirm whether that is
		intended or whether a CPU fallback should be added there as well.
	*/
	#ifdef macintosh
	if (@available (macOS 10.13, *)) {
		/*
			The speed is 0.000'005, 0.004, 1.86, 17.1, 13.8, 58, 41.4, 76, 42.5, 88, 63, 120, 255, 337, 525, 577, 620, 734, 769, 798, 894, 857, 868, 900, 872, 875, 956, 914 Gflop/s AMD Radeon Pro Vega 20 (4 GB)
			The speed is 0.000'007, 0.006, 2.37, 20.3, 15.7, 42, 35.7, 73, 54.3, 90, 72, 118, 245, 349, 575, 667, 1024, 1028, 1090, 1175, 1220, 1206, 1249, 1285, 1265, 1260, 1302, 1311, 1336, 1192, 1226 Gflop/s AMD Radeon RX Vega 56 (BlackMagic, 8 GB)
			for size = 1, 10, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 2000, 3000, 5000, 7000, 10000, 12000, 15000, 17000, 18000, 18500, 18700, 18800, 18900, 19000, 20000, 21000, 22000, 25000, 30000.
		*/
		/*
			One-time selection of a GPU device and creation of a command queue;
			both persist for the lifetime of the process.
		*/
		static bool gpuInited = false;
		static id <MTLDevice> gpuDevice;
		static id <MTLCommandQueue> gpuQueue;
		if (! gpuInited) {
			NSArray <id<MTLDevice>> *gpuDeviceList = MTLCopyAllDevices ();
			NSUInteger numberOfGpuDevices = [gpuDeviceList count];
			Melder_casual (U"Found ", numberOfGpuDevices, U" GPU devices.");
			if (numberOfGpuDevices < 2) {
				/*
					Easy choice.
				*/
				gpuDevice = MTLCreateSystemDefaultDevice ();
			} else {
				/*
					Prefer an external (removable) GPU, then a discrete
					(non-low-power) GPU, then the system default.
				*/
				int externalGpuDeviceNumber = -1, discreteGpuDeviceNumber = -1;
				for (NSUInteger idevice = 0; idevice < numberOfGpuDevices; idevice ++) {
					id <MTLDevice> device = [gpuDeviceList objectAtIndex: idevice];
					autostring32 deviceName = Melder_8to32 ([[device name] UTF8String]);
					Melder_casual (U"GPU device ", idevice, U": ", deviceName.get());
					if (device. removable)
						externalGpuDeviceNumber = int (idevice);
					else if (! device. lowPower)
						discreteGpuDeviceNumber = int (idevice);
				}
				if (externalGpuDeviceNumber != -1)
					gpuDevice = [gpuDeviceList objectAtIndex: NSUInteger (externalGpuDeviceNumber)];
				else if (discreteGpuDeviceNumber != -1)
					gpuDevice = [gpuDeviceList objectAtIndex: NSUInteger (discreteGpuDeviceNumber)];
				else
					gpuDevice = MTLCreateSystemDefaultDevice ();   // unlikely fallback
			}
			autostring32 deviceName = Melder_8to32 ([[gpuDevice name] UTF8String]);
			Melder_casual (U"GPU device for computing: ", deviceName.get());
			gpuInited = true;
			gpuQueue = [gpuDevice newCommandQueue];
		}
		//Melder_casual (U"start ", Melder_stopwatch ());
		MPSMatrixMultiplication *matrixMultiplication = [[MPSMatrixMultiplication alloc]
			initWithDevice: gpuDevice
			resultRows: integer_to_uinteger (target.nrow)
			resultColumns: integer_to_uinteger (target.ncol)
			interiorColumns: integer_to_uinteger (x.ncol)
		];
		Melder_assert (matrixMultiplication != nil);

		/*
			Convert x to a float32 staging buffer, with the row stride
			(possibly padded) that MPS prescribes for this column count.
		*/
		uinteger xRowStrideInBytes = [MPSMatrixDescriptor rowBytesForColumns: uinteger (x.ncol) dataType: MPSDataTypeFloat32];
		uinteger xRowStrideInFloats = xRowStrideInBytes / sizeof (float);
		autovector <float> x32 = newvectorzero <float> (integer (uinteger (x.nrow) * xRowStrideInFloats));
		for (integer irow = 1; irow <= x.nrow; irow ++) {
			float *prow = & x32 [1] + uinteger (irow - 1) * xRowStrideInFloats;
			for (integer icol = 1; icol <= x.ncol; icol ++)
				*prow ++ = float (x [irow] [icol]);
		}

		/*
			Same float32 staging for y.
		*/
		uinteger yRowStrideInBytes = [MPSMatrixDescriptor rowBytesForColumns: uinteger (y.ncol) dataType: MPSDataTypeFloat32];
		uinteger yRowStrideInFloats = yRowStrideInBytes / sizeof (float);
		autovector <float> y32 = newvectorzero <float> (integer (uinteger (y.nrow) * yRowStrideInFloats));
		for (integer irow = 1; irow <= y.nrow; irow ++) {
			float *prow = & y32 [1] + uinteger (irow - 1) * yRowStrideInFloats;
			for (integer icol = 1; icol <= y.ncol; icol ++)
				*prow ++ = float (y [irow] [icol]);
		}

		/*
			A zeroed float32 buffer for the result; the stride is computed
			from y.ncol, which equals target.ncol for a multiplication.
		*/
		uinteger targetRowStrideInBytes = [MPSMatrixDescriptor rowBytesForColumns: uinteger (y.ncol) dataType: MPSDataTypeFloat32];
		uinteger targetRowStrideInFloats = targetRowStrideInBytes / sizeof (float);
		autovector <float> target32 = newvectorzero <float> (integer (uinteger (target.nrow) * targetRowStrideInFloats));

		uinteger x32length = uinteger (x.nrow) * xRowStrideInBytes;
		id <MTLBuffer> bufferX = [gpuDevice newBufferWithBytes: & x32 [1]   length: x32length   options: MTLResourceStorageModeManaged];
		Melder_assert (bufferX != nil);

		uinteger y32length = uinteger (y.nrow) * yRowStrideInBytes;
		id <MTLBuffer> bufferY = [gpuDevice newBufferWithBytes: & y32 [1]   length: y32length   options: MTLResourceStorageModeManaged];
		Melder_assert (bufferY != nil);

		uinteger target32length = uinteger (target.nrow) * targetRowStrideInBytes;
		id <MTLBuffer> bufferTarget = [gpuDevice newBufferWithBytes: & target32 [1]   length: target32length   options: MTLResourceStorageModeShared];
		Melder_assert (bufferTarget != nil);
		//Melder_casual (U"to GPU ", Melder_stopwatch ());

		MPSMatrixDescriptor *descriptorX =
			[MPSMatrixDescriptor matrixDescriptorWithRows: uinteger (x.nrow)
				columns: uinteger (x.ncol)
				rowBytes: xRowStrideInBytes
				dataType: MPSDataTypeFloat32];
		MPSMatrixDescriptor *descriptorY =
			[MPSMatrixDescriptor matrixDescriptorWithRows: uinteger (y.nrow)
				columns: uinteger (y.ncol)
				rowBytes: yRowStrideInBytes
				dataType: MPSDataTypeFloat32];
		MPSMatrixDescriptor *descriptorTarget =
			[MPSMatrixDescriptor matrixDescriptorWithRows: uinteger (target.nrow)
				columns: uinteger (target.ncol)
				rowBytes: targetRowStrideInBytes
				dataType: MPSDataTypeFloat32];

		MPSMatrix *mpsX = [[MPSMatrix alloc] initWithBuffer: bufferX   descriptor: descriptorX];
		Melder_assert (mpsX != nil);
		MPSMatrix *mpsY = [[MPSMatrix alloc] initWithBuffer: bufferY   descriptor: descriptorY];
		Melder_assert (mpsY != nil);
		MPSMatrix *mpsTarget = [[MPSMatrix alloc] initWithBuffer: bufferTarget   descriptor: descriptorTarget];
		Melder_assert (mpsTarget != nil);

		NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
		id <MTLCommandBuffer> commandBuffer = [gpuQueue commandBuffer];   // autoreleased
		[matrixMultiplication encodeToCommandBuffer: commandBuffer
			leftMatrix: mpsX
			rightMatrix: mpsY
			resultMatrix: mpsTarget];
		[commandBuffer commit];
		/*
			For testing. With numberOfTimes == 0 this loop is disabled;
			it re-encodes the multiplication repeatedly for benchmarking.
		*/
		constexpr integer numberOfTimes = 0;
		id <MTLCommandBuffer> commandBuffers [numberOfTimes];
		for (integer itime = 0; itime < numberOfTimes; itime ++) {
			commandBuffers [itime] = [gpuQueue commandBuffer];
			[matrixMultiplication encodeToCommandBuffer: commandBuffers [itime]
				leftMatrix: mpsX
				rightMatrix: mpsTarget
				resultMatrix: mpsY];
			[matrixMultiplication encodeToCommandBuffer: commandBuffers [itime]
				leftMatrix: mpsX
				rightMatrix: mpsY
				resultMatrix: mpsTarget];
			[commandBuffers [itime] commit];
		}
		[commandBuffer waitUntilCompleted];
		for (integer itime = 0; itime < numberOfTimes; itime ++) {
			[commandBuffers [itime] waitUntilCompleted];
		}
		//Melder_casual (U"in GPU ", Melder_stopwatch ());
		NSError *error = [commandBuffer error];
		if (error) {
			/*
				Save the error messages before the release of `commandBuffer` invalidates `error`.
			*/
			autostring32 localizedDescription = Melder_8to32 ([[error localizedDescription] UTF8String]);
			autostring32 localizedFailureReason = Melder_8to32 ([[error localizedFailureReason] UTF8String]);
			autostring32 localizedRecoverySuggestion = Melder_8to32 ([[error localizedRecoverySuggestion] UTF8String]);
			[bufferX release];
			[bufferY release];
			[bufferTarget release];
			[matrixMultiplication release];
			[mpsX release];
			[mpsY release];
			[mpsTarget release];
			[pool release];   // this releases `commandBuffer`
			Melder_throw (U"Matrix multiplication in Metal: Error during execution: ",
				localizedDescription.get(), localizedFailureReason.get(), localizedRecoverySuggestion.get());
		}
		/*
			NOTE(review): at this point `error` is nil (the non-nil case threw
			above), so this release is a no-op; releasing a non-owned error
			object would otherwise violate Cocoa ownership rules — confirm.
		*/
		[error release];
		/*
			Copy the float32 result back into the double-precision target.
		*/
		const float * const rawPointer = (const float *) [bufferTarget contents];
		for (integer irow = 1; irow <= target.nrow; irow ++) {
			const float * prow = rawPointer + uinteger (irow - 1) * targetRowStrideInFloats;
			for (integer icol = 1; icol <= target.ncol; icol ++) {
				const double value = double (*prow ++);
				target [irow] [icol] = value;
			}
		}
		//Melder_casual (U"from GPU ", Melder_stopwatch ());
		[bufferX release];
		[bufferY release];
		[bufferTarget release];
		[matrixMultiplication release];
		[mpsX release];
		[mpsY release];
		[mpsTarget release];
		//[descriptorX release];   // apparently the MPSMatrix objects have become owners?
		//[descriptorY release];
		//[descriptorTarget release];
		[pool release];   // this releases `commandBuffer`
		/*
			Check the result: sample a cube-root-sized number of random cells
			and compare them against a CPU-computed inner product.
		*/
		//return;
		const integer numberOfChecks = Melder_iround (pow (target.nrow * target.ncol, 0.33333));
		integer numberOfUnexpectedZeroes = 0;
		for (integer icheck = 1; icheck <= numberOfChecks; icheck ++) {
			const integer rowNumber = NUMrandomInteger (1, target.nrow);
			const integer columnNumber = NUMrandomInteger (1, target.ncol);
			const double targetValue = target [rowNumber] [columnNumber];
			double checkedValue = 0.0;
			for (integer i = 1; i <= x.ncol; i ++)
				checkedValue += x [rowNumber] [i] * y [i] [columnNumber];
			if (checkedValue != 0.0) {
				//Melder_require (targetValue != 0.0,
				//	U"GPU matrix multiplication incorrect: unexpected zero at row ", rowNumber, U" and column ", columnNumber, U": value should be ", checkedValue, U".");
				if (targetValue == 0.0) {
					numberOfUnexpectedZeroes ++;
				} else {
					const double relativeError = fabs (checkedValue / targetValue - 1.0);
					if (relativeError > 10.0) Melder_casual
						(U"GPU matrix multiplication warning: unexpected imprecision of ", relativeError, U".");
				}
			}
		}
		if (numberOfUnexpectedZeroes > 0) Melder_casual
			(U"GPU matrix multiplication warning: found ", numberOfUnexpectedZeroes, U" unexpected zeroes, out of ", numberOfChecks, U".");
		return;
	}
	#else
	mul_MAT_out (target, x, y);
	#endif
}
682
MATmul_forceOpenCL_(MATVU const & target,constMATVU const & x,constMATVU const & y)683 void MATmul_forceOpenCL_ (MATVU const& target, constMATVU const& x, constMATVU const& y) {
684 #ifdef macintosh
685 static bool gpuInited = false;
686 static cl_context openclContext = nullptr;
687 static cl_command_queue commandQueue = nullptr;
688 static cl_kernel kernel = nullptr;
689 if (! gpuInited) {
690 cl_platform_id platformIDs = nullptr;
691 cl_uint numberOfPlatforms;
692 cl_int status = clGetPlatformIDs (1, & platformIDs, & numberOfPlatforms);
693 cl_device_id deviceIDs = nullptr;
694 cl_uint numberOfDevices;
695 status = clGetDeviceIDs (platformIDs, CL_DEVICE_TYPE_GPU, 1, & deviceIDs, & numberOfDevices);
696 openclContext = clCreateContext (nullptr, 1, & deviceIDs, nullptr, nullptr, & status);
697 commandQueue = clCreateCommandQueue (openclContext, deviceIDs, 0, & status);
698 conststring8 sourceText = "__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {"
699
700 /* Get the index of the current element to be processed */
701 "int i = get_global_id(0);"
702
703 /* Do the operation */
704 "C[i] = A[i] + B[i];"
705 "}";
706 size_t sourceSize = strlen (sourceText);
707 cl_program program = clCreateProgramWithSource (openclContext, 1,
708 (const char **) & sourceText, (const size_t *) & sourceSize, & status);
709 status = clBuildProgram (program, 1, & deviceIDs, nullptr, nullptr, nullptr);
710 kernel = clCreateKernel (program, "MATmul_opencl", & status);
711 }
712 #if 0
713 // Create memory buffers on the device for each vector
714 cl_mem a_mem_obj = clCreateBuffer (context, CL_MEM_READ_ONLY,
715 LIST_SIZE * sizeof(int), NULL, &ret);
716 cl_mem b_mem_obj = clCreateBuffer (context, CL_MEM_READ_ONLY,
717 LIST_SIZE * sizeof(int), NULL, &ret);
718 cl_mem c_mem_obj = clCreateBuffer (context, CL_MEM_WRITE_ONLY,
719 LIST_SIZE * sizeof(int), NULL, &ret);
720 // Copy the lists A and B to their respective memory buffers
721 ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
722 LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
723 ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
724 LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
725 // Set the arguments of the kernel
726 ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
727 ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
728 ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
729
730 // Execute the OpenCL kernel on the list
731 size_t global_item_size = LIST_SIZE; // Process the entire lists
732 size_t local_item_size = 64; // Divide work items into groups of 64
733 ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
734 &global_item_size, &local_item_size, 0, NULL, NULL);
735
736 // Read the memory buffer C on the device to the local variable C
737 int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
738 ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
739 LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
740
741 // Display the result to the screen
742 for(i = 0; i < LIST_SIZE; i++)
743 printf("%d + %d = %d\n", A[i], B[i], C[i]);
744 // Clean up
745 ret = clFlush(command_queue);
746 ret = clFinish(command_queue);
747 ret = clReleaseMemObject(a_mem_obj);
748 ret = clReleaseMemObject(b_mem_obj);
749 ret = clReleaseMemObject(c_mem_obj);
750 free(A);
751 free(B);
752 free(C);
753 #endif
754 #else
755 mul_MAT_out (target, x, y);
756 #endif
757 }
758
outer_MAT_out(MATVU const & target,constVECVU const & x,constVECVU const & y)759 void outer_MAT_out (MATVU const& target, constVECVU const& x, constVECVU const& y) {
760 for (integer irow = 1; irow <= x.size; irow ++)
761 for (integer icol = 1; icol <= y.size; icol ++)
762 target [irow] [icol] = x [irow] * y [icol];
763 }
outer_MAT(constVECVU const & x,constVECVU const & y)764 autoMAT outer_MAT (constVECVU const& x, constVECVU const& y) {
765 autoMAT result = raw_MAT (x.size, y.size);
766 outer_MAT_out (result.get(), x, y);
767 return result;
768 }
769
autoMAT peaks_MAT (constVECVU const& x, bool includeEdges, int interpolate, bool sortByHeight) {
	/*
		Find the local maxima of x.
		Returns a 2 × numberOfPeaks matrix:
			row 1 = the peak positions (fractional indices if `interpolate` is nonzero),
			row 2 = the peak heights.
		An interior peak is an i (2 <= i <= x.size - 1) with
		x [i] > x [i-1] && x [i] >= x [i+1]; with `includeEdges`, a first
		element greater than the second, and a last element greater than the
		one before it, also count as peaks.
	*/
	if (x.size < 2) includeEdges = false;   // with fewer than two elements there are no edges to compare (and no interior peaks either)
	/*
		First pass: count the peaks, so that the result can be allocated exactly.
	*/
	integer numberOfPeaks = 0;
	for (integer i = 2; i < x.size; i ++)
		if (x [i] > x [i - 1] && x [i] >= x [i + 1])
			numberOfPeaks ++;
	if (includeEdges) {
		if (x [1] > x [2])
			numberOfPeaks ++;
		if (x [x.size] > x [x.size - 1])
			numberOfPeaks ++;
	}
	autoMAT result = raw_MAT (2, numberOfPeaks);
	/*
		Second pass: record position and height of each peak,
		using the same criteria as the counting pass.
	*/
	integer peakNumber = 0;
	if (includeEdges && x [1] > x [2]) {
		result [1] [++ peakNumber] = 1;
		result [2] [peakNumber] = x [1];
	}
	for (integer i = 2; i < x.size; i ++) {
		if (x [i] > x [i - 1] && x [i] >= x [i + 1]) {
			++ peakNumber;
			if (interpolate != 0) {   // this is not a boolean; there could follow more options
				/*
					Parabolic interpolation through the three points around the peak.
				*/
				const double dy = 0.5 * (x [i + 1] - x [i - 1]);
				const double d2y = (x [i] - x [i - 1]) + (x [i] - x [i + 1]);
				Melder_assert (d2y > 0.0);   // guaranteed: x [i] > x [i - 1] makes the first term positive, and the second is nonnegative
				result [1] [peakNumber] = (double) i + dy / d2y;
				result [2] [peakNumber] = x [i] + 0.5 * dy * (dy / d2y);
			} else {
				/*
					Don't interpolate: choose the nearest index.
				*/
				result [1] [peakNumber] = i;
				result [2] [peakNumber] = x [i];
			}
		}
	}
	if (includeEdges && x [x.size] > x [x.size - 1]) {
		result [1] [++ peakNumber] = x.size;
		result [2] [peakNumber] = x [x.size];
	}
	Melder_assert (peakNumber == numberOfPeaks);   // both passes have to agree
	if (sortByHeight) {
		/*
			Sort the peaks from high to low: negate the heights, sort them
			ascendingly (carrying the positions along), then negate back.
		*/
		for (integer i = 1; i <= numberOfPeaks; i ++)
			result [2] [i] *= -1.0;
		NUMsortTogether (result.row (2), result.row (1));

		for (integer i = 1; i <= numberOfPeaks; i ++)
			result [2] [i] *= -1.0;
	}
	return result;
}
824
power_MAT_out(MATVU const & target,constMATVU const & mat,double power)825 void power_MAT_out (MATVU const& target, constMATVU const& mat, double power) {
826 for (integer irow = 1; irow <= target.nrow; irow ++)
827 power_VEC_out (target.row (irow), mat.row (irow), power);
828 }
829
830 /* End of file MAT.cpp */
831