1 /* MAT.cpp
2  *
3  * Copyright (C) 2017-2021 Paul Boersma
4  *
5  * This code is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or (at
8  * your option) any later version.
9  *
10  * This code is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13  * See the GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this work. If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "melder.h"
20 #include "../dwsys/NUM2.h"
21 //#include "../external/gsl/gsl_blas.h"
22 
23 #ifdef macintosh
24 	#include <Accelerate/Accelerate.h>
25 	#undef trace
26 	#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
27 	#include <OpenCL/opencl.h>
28 #endif
29 
centreEachColumn_MAT_inout(MATVU const & x)30 void centreEachColumn_MAT_inout (MATVU const& x) noexcept {
31 	for (integer icol = 1; icol <= x.ncol; icol ++) {
32 		const double columnMean = NUMmean (x.column (icol));
33 		for (integer irow = 1; irow <= x.nrow; irow ++)
34 			x [irow] [icol] -= columnMean;
35 	}
36 }
37 
centreEachRow_MAT_inout(MATVU const & x)38 void centreEachRow_MAT_inout (MATVU const& x) noexcept {
39 	for (integer irow = 1; irow <= x.nrow; irow ++)
40 		centre_VEC_inout (x [irow]);
41 }
42 
doubleCentre_MAT_inout(MATVU const & x)43 void doubleCentre_MAT_inout (MATVU const& x) noexcept {
44 	centreEachRow_MAT_inout (x);
45 	centreEachColumn_MAT_inout (x);
46 }
47 
void mtm_MAT_out (MATVU const& target, constMATVU const& x) noexcept {
	/*
		Compute target := x'.x, i.e. the matrix of inner products of the
		columns of x. The result is symmetric, so only the upper triangle
		is computed; the lower triangle is obtained by mirroring.
		The two disabled variants below are kept for benchmarking.
	*/
	Melder_assert (target.nrow == x.ncol);
	Melder_assert (target.ncol == x.ncol);
	#if 0
	/*
		Disabled variant 1: straight triple loop with a long-double accumulator.
	*/
	for (integer irow = 1; irow <= target.nrow; irow ++) {
		for (integer icol = irow; icol <= target.ncol; icol ++) {
			longdouble t = 0.0;
			for (integer k = 1; k <= x.nrow; k ++)
				t += x [k] [irow] * x [k] [icol];
			target [irow] [icol] = target [icol] [irow] = double (t);
		}
	}
	#elif 0
	/*
		Disabled variant 2: pairwise summation down the rows of x,
		stepping both column pointers by x.rowStride.
	*/
	for (integer irow = 1; irow <= target.nrow; irow ++) {
		for (integer icol = irow; icol <= target.ncol; icol ++) {
			PAIRWISE_SUM (longdouble, sum, integer, x.nrow,
				const double *px1 = & x [1] [irow];
				const double *px2 = & x [1] [icol],
				longdouble (*px1) * longdouble (*px2),
				(px1 += x.rowStride, px2 += x.rowStride)
			)
			target [irow] [icol] = target [icol] [irow] = double (sum);
		}
	}
	#else
	/*
		Active variant: clear the upper triangle, then accumulate the
		contribution of one row of x at a time (so x is traversed row by row),
		and finally mirror the upper triangle into the lower triangle.
	*/
	for (integer irow = 1; irow <= target.nrow; irow ++)
		for (integer icol = irow; icol <= target.ncol; icol ++)
			target [irow] [icol] = 0.0;
	for (integer k = 1; k <= x.nrow; k ++)
		for (integer irow = 1; irow <= target.nrow; irow ++)
			for (integer icol = irow; icol <= target.ncol; icol ++)
				target [irow] [icol] += x [k] [irow] * x [k] [icol];
	for (integer irow = 2; irow <= target.nrow; irow ++)
		for (integer icol = 1; icol < irow; icol ++)
			target [irow] [icol] = target [icol] [irow];
	#endif
}
85 
void _mul_MAT_out (MATVU const& target, constMATVU const& x, constMATVU const& y) noexcept {
	/*
		Precise matrix multiplication (target := x . y), using pairwise summation.
		Four cases are distinguished, depending on whether the inner loop can
		walk x with unit stride (x.colStride == 1) and/or y with unit stride
		(y.rowStride == 1); the loop bodies are identical except for the
		pointer increments in the PAIRWISE_SUM invocation.
	*/
	if (x.colStride == 1) {
		if (y.rowStride == 1) {
			/*
				Appropriate for Target := X.Y',
				if X and Y are packed row-major matrices.
				The speed is 0.142, 0.716, 1.32, 1.64, 2.33, 2.22, 2.08, 2.31, 2.18, 1.89, 1.77, 1.53 Gflop/s
				for size =       1,     3,   10,   20,   50,  100,  200,  500, 1000, 2000, 3000, 5000.
			*/
			for (integer irow = 1; irow <= target.nrow; irow ++) {
				for (integer icol = 1; icol <= target.ncol; icol ++) {
					PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
						const double *px = & x [irow] [1];
						const double *py = & y [1] [icol],
						longdouble (*px) * longdouble (*py),
						(px += 1, py += 1)   // both operands contiguous
					)
					target [irow] [icol] = double (sum);
				}
			}
		} else {
			/*
				Appropriate for Target := X.Y,
				if X and Y are packed row-major matrices.
				The speed is 0.143, 0.684, 1.20, 1.64, 2.24, 2.04, 1.44, 1.22, 0.56, 0.114 Gflop/s
				for size =       1,     3,   10,   20,   50,  100,  200,  500, 1000,  2000.
			*/
			for (integer irow = 1; irow <= target.nrow; irow ++) {
				for (integer icol = 1; icol <= target.ncol; icol ++) {
					PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
						const double *px = & x [irow] [1];
						const double *py = & y [1] [icol],
						longdouble (*px) * longdouble (*py),
						(px += 1, py += y.rowStride)   // y walked down a column
					)
					target [irow] [icol] = double (sum);
				}
			}
		}
	} else if (y.rowStride == 1) {
		/*
			Appropriate for Target := X'.Y',
			if X and Y are packed row-major matrices.
			The speed is 0.136, 0.666, 1.22, 1.65, 2.36, 1.96, 1.62, 1.24, 0.69, 0.118 Gflop/s
			for size =       1,     3,   10,   20,   50,  100,  200,  500, 1000,  2000.
		*/
		for (integer irow = 1; irow <= target.nrow; irow ++) {
			for (integer icol = 1; icol <= target.ncol; icol ++) {
				PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
					const double *px = & x [irow] [1];
					const double *py = & y [1] [icol],
					longdouble (*px) * longdouble (*py),
					(px += x.colStride, py += 1)   // x walked along a (strided) row
				)
				target [irow] [icol] = double (sum);
			}
		}
	} else {
		/*
			Appropriate for Target := X'.Y,
			if X and Y are packed row-major matrices.
			The speed is 0.143, 0.572, 1.10, 1.43, 1.71, 1.70, 1.29, 0.71, 0.067 Gflop/s
			for size =       1,     3,   10,   20,   50,  100,  200,  500,  1000.
		*/
		for (integer irow = 1; irow <= target.nrow; irow ++) {
			for (integer icol = 1; icol <= target.ncol; icol ++) {
				PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
					const double *px = & x [irow] [1];
					const double *py = & y [1] [icol],
					longdouble (*px) * longdouble (*py),
					(px += x.colStride, py += y.rowStride)   // both strided: the slowest case
				)
				target [irow] [icol] = double (sum);
			}
		}
	}
}
166 
void _mul_forceAllocation_MAT_out (MATVU const& target, constMATVU x, constMATVU y) {
	/*
		Precise multiplication (target := x . y) that first copies the operands,
		if necessary, into layouts that allow unit-stride inner loops.
		Note that x and y are taken BY VALUE, so they can be rebound below.

		As seen above, the only multiplication that stays fast for large sizes,
		if X and Y are packed row-major matrices, is X.Y';
		this is because both x.colStride and y.rowStride are 1 in that case.
		It may therefore be useful to convert any matrix X that has
		a column stride unequal to 1, and any matrix Y that has a row stride
		unequal to 1, to matrices that do have these desirable properties.

		For the X.Y case, where X and Y are packed row-major matrices,
		the speed is 0.084, 0.124, 0.91, 1.56, 2.26, 2.18, 2.12, 2.25, 2.23, 1.85, 1.78, 1.57 Gflop/s
		for size =       1,     3,   10,   20,   50,  100,  200,  500, 1000, 2000, 3000, 5000.

		For the X.Y' case, where X and Y are packed row-major matrices, there is no allocation, and
		the speed is 0.084, 0.610, 1.26, 1.69, 2.32, 2.20, 2.12, 2.28, 2.24, 1.91, 1.76, 1.53 Gflop/s
		for size =       1,     3,   10,   20,   50,  100,  200,  500, 1000, 2000, 3000, 5000.

		For the X'.Y case, where X and Y are packed row-major matrices,
		the speed is 0.082, 0.068, 0.73, 1.42, 2.20, 2.14, 2.09, 2.27, 2.21, 1.84, 1.77, 1.53 Gflop/s
		for size =       1,     3,   10,   20,   50,  100,  200,  500, 1000, 2000, 3000, 5000.

		For the X'.Y' case, where X and Y are packed row-major matrices,
		the speed is 0.082, 0.117, 0.90, 1.57, 2.25, 2.19, 2.12, 2.23, 2.09, 1.92, 1.69, 1.48 Gflop/s
		for size =       1,     3,   10,   20,   50,  100,  200,  500, 1000, 2000, 3000, 5000.
	*/
	autoMAT tmpX, tmpY;   // the scope shall extend to the end of the function
	if (x.colStride != 1) {
		// copy x into a packed row-major matrix, then view it through `x`
		tmpX = copy_MAT (x);
		x = tmpX.all();
		Melder_assert (x.colStride == 1);
	}
	if (y.rowStride != 1) {
		// store y transposed, then view the transpose back, yielding rowStride 1
		tmpY = transpose_MAT (y);
		y = tmpY.transpose();
		Melder_assert (y.rowStride == 1);
	}
	// Both operands now allow the fully contiguous inner loop.
	for (integer irow = 1; irow <= target.nrow; irow ++) {
		for (integer icol = 1; icol <= target.ncol; icol ++) {
			PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
				const double *px = & x [irow] [1];
				const double *py = & y [1] [icol],
				longdouble (*px) * longdouble (*py),
				(px += 1, py += 1)
			)
			target [irow] [icol] = double (sum);
		}
	}
}
215 
void _mul_allowAllocation_MAT_out (MATVU const& target, constMATVU x, constMATVU y) {
	/*
		Precise multiplication (target := x . y) that allocates a repacked copy
		of an unfavourably strided operand only when the work (nrow*ncol*inner,
		i.e. the flop count / 2) exceeds 1e5, so that small products avoid the
		allocation overhead. x and y are taken by value so they can be rebound.

		The faster of _mul_MAT_out and _mul_forceAllocation_MAT_out.
		Allocation takes place only for larger matrices, e.g. from size 47 on
		(100,000 flops).

		For the X.Y case, where X and Y are packed row-major matrices,
		the speed is 0.204, 1.130, 3.74, 3.47, 4.77, 3.81, 4.05, 4.26, 3.92, 2.82, 2.83, 2.80 Gflop/s on 2.9 GHz i9
		the speed is 0.158, 1.452, 3.70, 4.40, 6.01, 4.55, 4.56, 4.76, 4.84, 3.23, 3.23, 3.26 Gflop/s with SSE 4.2
		the speed is 0.135, 1.483, 3.85, 4.80, 5.60, 4.49, 4.43, 4.52, 5.05, 3.31, 3.19, 3.25 Gflop/s with AVX2
		the speed is 0.087, 0.574, 1.18, 1.61, 2.25, 2.14, 2.11, 2.23, 2.23, 1.88, 1.74, 1.53 Gflop/s
		for size =       1,     3,   10,   20,   50,  100,  200,  500, 1000, 2000, 3000, 5000.

		For the X.Y' case, where X and Y are packed row-major matrices, there is never allocation, and
		the speed is 0.088, 0.577, 1.28, 1.67, 2.27, 2.18, 2.12, 2.28, 2.20, 1.96, 1.78, 1.57 Gflop/s
		for size =       1,     3,   10,   20,   50,  100,  200,  500, 1000, 2000, 3000, 5000.

		For the X'.Y case, where X and Y are packed row-major matrices,
		the speed is 0.084, 0.547, 1.12, 1.44, 2.20, 2.15, 2.04, 2.25, 2.18, 1.92, 1.74, 1.48 Gflop/s
		for size =       1,     3,   10,   20,   50,  100,  200,  500, 1000, 2000, 3000, 5000.

		For the X'.Y' case, where X and Y are packed row-major matrices,
		the speed is 0.084, 0.553, 1.18, 1.63, 2.31, 2.12, 2.12, 2.25, 2.16, 1.90, 1.79, 1.50 Gflop/s
		for size =       1,     3,   10,   20,   50,  100,  200,  500, 1000, 2000, 3000, 5000.
	*/
	if (x.colStride == 1) {
		if (y.rowStride == 1) {
			/*
				Both operands are already favourable: fully contiguous inner loop,
				no allocation ever needed.
			*/
			for (integer irow = 1; irow <= target.nrow; irow ++) {
				for (integer icol = 1; icol <= target.ncol; icol ++) {
					PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
						const double *px = & x [irow] [1];
						const double *py = & y [1] [icol],
						longdouble (*px) * longdouble (*py),
						(px += 1, py += 1)
					)
					target [irow] [icol] = double (sum);
				}
			}
		} else {
			if (double (target.nrow) * double (target.ncol) * double (x.ncol) > 1e5) {
				/*
					Large product: repack y (transposed) so that its row stride becomes 1.
				*/
				autoMAT tmpY = transpose_MAT (y);
				y = tmpY.transpose();
				Melder_assert (y.rowStride == 1);
				for (integer irow = 1; irow <= target.nrow; irow ++) {
					for (integer icol = 1; icol <= target.ncol; icol ++) {
						PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
							const double *px = & x [irow] [1];
							const double *py = & y [1] [icol],
							longdouble (*px) * longdouble (*py),
							(px += 1, py += 1)
						)
						target [irow] [icol] = double (sum);
					}
				}
			} else {
				/*
					Small product: not worth the allocation; walk y with its row stride.
				*/
				for (integer irow = 1; irow <= target.nrow; irow ++) {
					for (integer icol = 1; icol <= target.ncol; icol ++) {
						PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
							const double *px = & x [irow] [1];
							const double *py = & y [1] [icol],
							longdouble (*px) * longdouble (*py),
							(px += 1, py += y.rowStride)
						)
						target [irow] [icol] = double (sum);
					}
				}
			}
		}
	} else if (y.rowStride == 1) {
		if (double (target.nrow) * double (target.ncol) * double (x.ncol) > 1e5) {
			/*
				Large product: repack x so that its column stride becomes 1.
			*/
			autoMAT tmpX = copy_MAT (x);
			x = tmpX.all();
			Melder_assert (x.colStride == 1);
			for (integer irow = 1; irow <= target.nrow; irow ++) {
				for (integer icol = 1; icol <= target.ncol; icol ++) {
					PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
						const double *px = & x [irow] [1];
						const double *py = & y [1] [icol],
						longdouble (*px) * longdouble (*py),
						(px += 1, py += 1)
					)
					target [irow] [icol] = double (sum);
				}
			}
		} else {
			/*
				Small product: walk x with its column stride.
			*/
			for (integer irow = 1; irow <= target.nrow; irow ++) {
				for (integer icol = 1; icol <= target.ncol; icol ++) {
					PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
						const double *px = & x [irow] [1];
						const double *py = & y [1] [icol],
						longdouble (*px) * longdouble (*py),
						(px += x.colStride, py += 1)
					)
					target [irow] [icol] = double (sum);
				}
			}
		}
	} else {
		if (double (target.nrow) * double (target.ncol) * double (x.ncol) > 1e5) {
			/*
				Large product with both operands unfavourable: repack both.
			*/
			autoMAT tmpX = copy_MAT (x);
			x = tmpX.all();
			Melder_assert (x.colStride == 1);
			autoMAT tmpY = transpose_MAT (y);
			y = tmpY.transpose();
			Melder_assert (y.rowStride == 1);
			for (integer irow = 1; irow <= target.nrow; irow ++) {
				for (integer icol = 1; icol <= target.ncol; icol ++) {
					PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
						const double *px = & x [irow] [1];
						const double *py = & y [1] [icol],
						longdouble (*px) * longdouble (*py),
						(px += 1, py += 1)
					)
					target [irow] [icol] = double (sum);
				}
			}
		} else {
			/*
				Small product with both operands unfavourable: strided walks on both.
			*/
			for (integer irow = 1; irow <= target.nrow; irow ++) {
				for (integer icol = 1; icol <= target.ncol; icol ++) {
					PAIRWISE_SUM (longdouble, sum, integer, x.ncol,
						const double *px = & x [irow] [1];
						const double *py = & y [1] [icol],
						longdouble (*px) * longdouble (*py),
						(px += x.colStride, py += y.rowStride)
					)
					target [irow] [icol] = double (sum);
				}
			}
		}
	}
}
347 
MATmul_rough_naiveReferenceImplementation(MATVU const & target,constMATVU const & x,constMATVU const & y)348 static inline void MATmul_rough_naiveReferenceImplementation (MATVU const& target, constMATVU const& x, constMATVU const& y) noexcept {
349 	/*
350 		If x.colStride == size and y.colStride == 1,
351 		this version is 0.073, 1.32, 1.17, 0.58 Gflop/s for size = 1,10,100,1000.
352 	*/
353 	for (integer irow = 1; irow <= target.nrow; irow ++) {
354 		for (integer icol = 1; icol <= target.ncol; icol ++) {
355 			target [irow] [icol] = 0.0;
356 			for (integer i = 1; i <= x.ncol; i ++)
357 				target [irow] [icol] += x [irow] [i] * y [i] [icol];
358 		}
359 	}
360 }
void _mul_fast_MAT_out (MATVU const& target, constMATVU const& x, constMATVU const& y) noexcept {
	/*
		Fast (plain-double-accumulation) multiplication target := x . y,
		choosing a loop order based on the strides of y. Only the X.Y' case
		delegates to the precise pairwise-summation routine _mul_MAT_out.
	*/
	if ((false)) {
		// Disabled reference path, kept for comparison during development.
		MATmul_rough_naiveReferenceImplementation (target, x, y);
	} else if (y.colStride == 1) {
		/*
			This case is appropriate for the multiplication of full matrices
				X.Y
			or
				X'.Y

			The speed for X.Y is 0.053, 1.37, 3.14, 2.99, 2.38, 2.06, 1.70 Gflop/s
			for size =               1,   10,  100, 1000, 2000, 3000, 5000.
			The speed for X'.Y is 0.063, 1.37, 3.11, 2.72 Gflop/s for size = 1,10,100,1000.

			The trick is to have the inner loop run along two fastest indices;
			for target as well as y, this fastest index is the last index.
			Note that the multiplication factor within the inner loop is constant,
			so we move it out of the loop (by hand, in case the compiler doesn't do it).
		*/
		#if 1
		for (integer irow = 1; irow <= target.nrow; irow ++) {
			VECVU const targetrow = target [irow];
			for (integer icol = 1; icol <= target.ncol; icol ++)
				targetrow [icol] = 0.0;
			for (integer i = 1; i <= x.ncol; i ++) {
				const double xcell = x [irow] [i];   // loop-invariant factor, hoisted by hand
				constVECVU const yrow = y [i];
				for (integer icol = 1; icol <= target.ncol; icol ++)
					targetrow [icol] += xcell * yrow [icol];
			}
		}
		#elif 0
			/*
				Using 64-bit BLAS from Apple's Accelerate framework.
				The speed for X.Y is 0.037, 1.51, 3.40, 2.40, 1.82, 1.64, 1.04 Gflop/s
				for size =               1,   10,  100, 1000, 2000, 3000, 5000.
				This is not really faster than our own simple implementation
				(perhaps 32-bit BLAS is faster, because it can use more flops per cycle).
			*/
			cblas_dgemm (CblasRowMajor, CblasNoTrans, CblasNoTrans,
				target.nrow, target.ncol, x.ncol,
				1.0,
				& x [1] [1], x.rowStride,
				& y [1] [1], y.rowStride,
				0.0,
				& target [1] [1], target.rowStride
			);
		#else
		/*
			An implementation that is notationally ideal.
			Does the compiler manage to move the constant parts
			of the expression outside the loop?

			The speed for X.Y is 0.056, 1.08, 2.99, 2.87 Gflop/s for size = 1,10,100,1000.
		*/
		for (integer irow = 1; irow <= target.nrow; irow ++) {
			for (integer icol = 1; icol <= target.ncol; icol ++)
				target [irow] [icol] = 0.0;
			for (integer i = 1; i <= x.ncol; i ++)
				for (integer icol = 1; icol <= target.ncol; icol ++)
					target [irow] [icol] += x [irow] [i] * y [i] [icol];
		}
		#endif
	} else if (y.rowStride == 1) {
		if (x.colStride == 1) {
			/*
				This case will be appropriate for the multiplication of full matrices
					X.Y'
				The speed is 0.064, 1.18, 1.67, 1.69 Gflop/s for size = 1,10,100,1000.
			*/
			_mul_MAT_out (target, x, y);
		} else {
			/*
				This case will be appropriate for the multiplication of full matrices
					X'.Y'

				So we will make this fast by making the target matrix column-major.
				The speed will be 0.065, 1.27, 1.45, 1.21 Gflop/s for size = 1,10,100,1000.
				However, this will work only once an automatrix has row and column strides;
				until that time we cannot really modify the structure of `target`.

				For the moment, the target has to stay row-major.
				The speed is 0.064, 1.21, 1.41, 0.43 Gflop/s for size = 1,10,100,1000.

				The trick is to have the inner loop run along two fastest indices;
				for both target (in future) and y, this fastest index is the first index.
			*/
			//target.rowStride = 1;
			//target.colStride = target.nrow;
			for (integer icol = 1; icol <= target.ncol; icol ++) {
				VECVU const targetcolumn = target.column (icol);
				for (integer irow = 1; irow <= target.nrow; irow ++)
					targetcolumn [irow] = 0.0;
				for (integer i = 1; i <= x.ncol; i ++) {
					constVECVU const ycolumn = y.column (i);
					for (integer irow = 1; irow <= target.nrow; irow ++)
						targetcolumn [irow] += x [irow] [i] * ycolumn [irow];
				}
			}
		}
	} else {
		/*
			A rare case: the strides of y are both greater than 1.
			We do not bother to optimize these cases yet.
		*/
		MATmul_rough_naiveReferenceImplementation (target, x, y);
	}
}
469 
void MATmul_forceMetal_ (MATVU const& target, constMATVU const& x, constMATVU const& y) {
/*
	GPU matrix multiplication target := x . y via Metal Performance Shaders,
	computed in 32-bit floats (MPSDataTypeFloat32), with a randomized
	spot-check of the result against a CPU inner product at the end.

	NOTE(review): on macintosh builds where `@available (macOS 10.13, *)` is
	false, this function falls through and returns WITHOUT writing `target`
	(the #else fallback below covers only non-macintosh builds) — confirm
	whether a CPU fallback is intended there.
*/
#ifdef macintosh
	if (@available (macOS 10.13, *)) {
		/*
			The speed is 0.000'005, 0.004, 1.86, 17.1, 13.8,  58, 41.4,  76, 42.5,  88,  63,  120,  255,  337,  525,  577,   620,   734,   769,   798,   894,   857,   868,   900,   872,   875,   956,   914                      Gflop/s AMD Radeon Pro Vega 20 (4 GB)
			The speed is 0.000'007, 0.006, 2.37, 20.3, 15.7,  42, 35.7,  73, 54.3,  90,  72,  118,  245,  349,  575,  667,  1024,  1028,  1090,  1175,  1220,  1206,  1249,  1285,  1265,  1260,  1302,  1311,  1336,  1192,  1226 Gflop/s AMD Radeon RX Vega 56 (BlackMagic, 8 GB)
			for size =           1,    10,  100,  200,  300, 400,  500, 600,  700, 800, 900, 1000, 2000, 3000, 5000, 7000, 10000, 12000, 15000, 17000, 18000, 18500, 18700, 18800, 18900, 19000, 20000, 21000, 22000, 25000, 30000.
		*/
		/*
			The GPU device and command queue are created once and cached.
			NOTE(review): these statics are initialized without synchronization —
			assumes this function is first called from a single thread; confirm.
		*/
		static bool gpuInited = false;
		static id <MTLDevice> gpuDevice;
		static id <MTLCommandQueue> gpuQueue;
		if (! gpuInited) {
			NSArray <id<MTLDevice>> *gpuDeviceList = MTLCopyAllDevices ();
			NSUInteger numberOfGpuDevices = [gpuDeviceList count];
			Melder_casual (U"Found ", numberOfGpuDevices, U" GPU devices.");
			if (numberOfGpuDevices < 2) {
				/*
					Easy choice.
				*/
				gpuDevice = MTLCreateSystemDefaultDevice ();
			} else {
				/*
					Prefer an external (removable) GPU, then a discrete
					(non-low-power) GPU, then the system default.
				*/
				int externalGpuDeviceNumber = -1, discreteGpuDeviceNumber = -1;
				for (NSUInteger idevice = 0; idevice < numberOfGpuDevices; idevice ++) {
					id <MTLDevice> device = [gpuDeviceList objectAtIndex: idevice];
					autostring32 deviceName = Melder_8to32 ([[device name] UTF8String]);
					Melder_casual (U"GPU device ", idevice, U": ", deviceName.get());
					if (device. removable)
						externalGpuDeviceNumber = int (idevice);
					else if (! device. lowPower)
						discreteGpuDeviceNumber = int (idevice);
				}
				if (externalGpuDeviceNumber != -1)
					gpuDevice = [gpuDeviceList objectAtIndex: NSUInteger (externalGpuDeviceNumber)];
				else if (discreteGpuDeviceNumber != -1)
					gpuDevice = [gpuDeviceList objectAtIndex: NSUInteger (discreteGpuDeviceNumber)];
				else
					gpuDevice = MTLCreateSystemDefaultDevice ();   // unlikely fallback
			}
			autostring32 deviceName = Melder_8to32 ([[gpuDevice name] UTF8String]);
			Melder_casual (U"GPU device for computing: ", deviceName.get());
			gpuInited = true;
			gpuQueue = [gpuDevice newCommandQueue];
		}
//Melder_casual (U"start ", Melder_stopwatch ());
		MPSMatrixMultiplication *matrixMultiplication = [[MPSMatrixMultiplication alloc]
			initWithDevice: gpuDevice
			resultRows: integer_to_uinteger (target.nrow)
			resultColumns: integer_to_uinteger (target.ncol)
			interiorColumns: integer_to_uinteger (x.ncol)
		];
		Melder_assert (matrixMultiplication != nil);

		/*
			Convert x to a padded 32-bit-float buffer, using the row stride
			(in bytes) that MPS requires for this number of columns.
		*/
		uinteger xRowStrideInBytes = [MPSMatrixDescriptor rowBytesForColumns: uinteger (x.ncol)  dataType: MPSDataTypeFloat32];
		uinteger xRowStrideInFloats = xRowStrideInBytes / sizeof (float);
		autovector <float> x32 = newvectorzero <float> (integer (uinteger (x.nrow) * xRowStrideInFloats));
		for (integer irow = 1; irow <= x.nrow; irow ++) {
			float *prow = & x32 [1] + uinteger (irow - 1) * xRowStrideInFloats;
			for (integer icol = 1; icol <= x.ncol; icol ++)
				*prow ++ = float (x [irow] [icol]);
		}

		/*
			Same conversion for y.
		*/
		uinteger yRowStrideInBytes = [MPSMatrixDescriptor rowBytesForColumns: uinteger (y.ncol)  dataType: MPSDataTypeFloat32];
		uinteger yRowStrideInFloats = yRowStrideInBytes / sizeof (float);
		autovector <float> y32 = newvectorzero <float> (integer (uinteger (y.nrow) * yRowStrideInFloats));
		for (integer irow = 1; irow <= y.nrow; irow ++) {
			float *prow = & y32 [1] + uinteger (irow - 1) * yRowStrideInFloats;
			for (integer icol = 1; icol <= y.ncol; icol ++)
				*prow ++ = float (y [irow] [icol]);
		}

		/*
			Zero-filled 32-bit result staging area (target.ncol == y.ncol here).
		*/
		uinteger targetRowStrideInBytes = [MPSMatrixDescriptor rowBytesForColumns: uinteger (y.ncol)  dataType: MPSDataTypeFloat32];
		uinteger targetRowStrideInFloats = targetRowStrideInBytes / sizeof (float);
		autovector <float> target32 = newvectorzero <float> (integer (uinteger (target.nrow) * targetRowStrideInFloats));

		/*
			Upload the three matrices into MTLBuffers; the inputs are Managed,
			the result is Shared so the CPU can read it back directly.
		*/
		uinteger x32length = uinteger (x.nrow) * xRowStrideInBytes;
		id <MTLBuffer> bufferX = [gpuDevice newBufferWithBytes: & x32 [1]  length: x32length  options: MTLResourceStorageModeManaged];
		Melder_assert (bufferX != nil);

		uinteger y32length = uinteger (y.nrow) * yRowStrideInBytes;
		id <MTLBuffer> bufferY = [gpuDevice newBufferWithBytes: & y32 [1]  length: y32length  options: MTLResourceStorageModeManaged];
		Melder_assert (bufferY != nil);

		uinteger target32length = uinteger (target.nrow) * targetRowStrideInBytes;
		id <MTLBuffer> bufferTarget = [gpuDevice newBufferWithBytes: & target32 [1]  length: target32length  options: MTLResourceStorageModeShared];
		Melder_assert (bufferTarget != nil);
//Melder_casual (U"to GPU ", Melder_stopwatch ());

		MPSMatrixDescriptor *descriptorX =
			[MPSMatrixDescriptor matrixDescriptorWithRows: uinteger (x.nrow)
				columns: uinteger (x.ncol)
				rowBytes: xRowStrideInBytes
				dataType: MPSDataTypeFloat32];
		MPSMatrixDescriptor *descriptorY =
			[MPSMatrixDescriptor matrixDescriptorWithRows: uinteger (y.nrow)
				columns: uinteger (y.ncol)
				rowBytes: yRowStrideInBytes
				dataType: MPSDataTypeFloat32];
		MPSMatrixDescriptor *descriptorTarget =
			[MPSMatrixDescriptor matrixDescriptorWithRows: uinteger (target.nrow)
				columns: uinteger (target.ncol)
				rowBytes: targetRowStrideInBytes
				dataType: MPSDataTypeFloat32];

		MPSMatrix *mpsX = [[MPSMatrix alloc] initWithBuffer: bufferX descriptor: descriptorX];
		Melder_assert (mpsX != nil);
		MPSMatrix *mpsY = [[MPSMatrix alloc] initWithBuffer: bufferY descriptor: descriptorY];
		Melder_assert (mpsY != nil);
		MPSMatrix *mpsTarget = [[MPSMatrix alloc] initWithBuffer: bufferTarget descriptor: descriptorTarget];
		Melder_assert (mpsTarget != nil);

		/*
			Encode and commit the multiplication; the command buffer is
			autoreleased, so it lives until `pool` is released.
		*/
		NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
		id <MTLCommandBuffer> commandBuffer = [gpuQueue commandBuffer];   // autoreleased
		[matrixMultiplication encodeToCommandBuffer: commandBuffer
			leftMatrix: mpsX
			rightMatrix: mpsY
			resultMatrix: mpsTarget];
		[commandBuffer commit];
		/*
			For testing.
			NOTE(review): with numberOfTimes == 0 this whole benchmarking section
			is inert; a zero-length local array relies on a compiler extension.
		*/
		constexpr integer numberOfTimes = 0;
		id <MTLCommandBuffer> commandBuffers [numberOfTimes];
		for (integer itime = 0; itime < numberOfTimes; itime ++) {
			commandBuffers [itime] = [gpuQueue commandBuffer];
			[matrixMultiplication encodeToCommandBuffer: commandBuffers [itime]
				leftMatrix: mpsX
				rightMatrix: mpsTarget
				resultMatrix: mpsY];
			[matrixMultiplication encodeToCommandBuffer: commandBuffers [itime]
				leftMatrix: mpsX
				rightMatrix: mpsY
				resultMatrix: mpsTarget];
			[commandBuffers [itime] commit];
		}
		[commandBuffer waitUntilCompleted];
		for (integer itime = 0; itime < numberOfTimes; itime ++) {
			[commandBuffers [itime] waitUntilCompleted];
		}
//Melder_casual (U"in GPU ", Melder_stopwatch ());
		NSError *error = [commandBuffer error];
		if (error) {
			/*
				Save the error messages before the release of `commandBuffer` invalidates `error`.
			*/
			autostring32 localizedDescription = Melder_8to32 ([[error localizedDescription] UTF8String]);
			autostring32 localizedFailureReason = Melder_8to32 ([[error localizedFailureReason] UTF8String]);
			autostring32 localizedRecoverySuggestion = Melder_8to32 ([[error localizedRecoverySuggestion] UTF8String]);
			[bufferX release];
			[bufferY release];
			[bufferTarget release];
			[matrixMultiplication release];
			[mpsX release];
			[mpsY release];
			[mpsTarget release];
			[pool release];   // this releases `commandBuffer`
			Melder_throw (U"Matrix multiplication in Metal: Error during execution: ",
				localizedDescription.get(), localizedFailureReason.get(), localizedRecoverySuggestion.get());
		}
		// NOTE(review): `error` is nil here (the error path threw above), so this
		// release is a no-op; releasing a non-nil autoreleased error would over-release — confirm.
		[error release];
		/*
			Copy the 32-bit result back into the double-precision target.
		*/
		const float * const rawPointer = (const float *) [bufferTarget contents];
		for (integer irow = 1; irow <= target.nrow; irow ++) {
			const float * prow = rawPointer + uinteger (irow - 1) * targetRowStrideInFloats;
			for (integer icol = 1; icol <= target.ncol; icol ++) {
				const double value = double (*prow ++);
				target [irow] [icol] = value;
			}
		}
//Melder_casual (U"from GPU ", Melder_stopwatch ());
		[bufferX release];
		[bufferY release];
		[bufferTarget release];
		[matrixMultiplication release];
		[mpsX release];
		[mpsY release];
		[mpsTarget release];
		//[descriptorX release];   // apparently the MPSMatrix objects have become owners?
		//[descriptorY release];
		//[descriptorTarget release];
		[pool release];   // this releases `commandBuffer`
		/*
			Check the result: spot-check ~cube-root-of-the-cell-count random cells
			against a CPU inner product, warning on unexpected zeroes or imprecision.
		*/
		//return;
		const integer numberOfChecks = Melder_iround (pow (target.nrow * target.ncol, 0.33333));
		integer numberOfUnexpectedZeroes = 0;
		for (integer icheck = 1; icheck <= numberOfChecks; icheck ++) {
			const integer rowNumber = NUMrandomInteger (1, target.nrow);
			const integer columnNumber = NUMrandomInteger (1, target.ncol);
			const double targetValue = target [rowNumber] [columnNumber];
			double checkedValue = 0.0;
			for (integer i = 1; i <= x.ncol; i ++)
				checkedValue += x [rowNumber] [i] * y [i] [columnNumber];
			if (checkedValue != 0.0) {
				//Melder_require (targetValue != 0.0,
				//	U"GPU matrix multiplication incorrect: unexpected zero at row ", rowNumber, U" and column ", columnNumber, U": value should be ", checkedValue, U".");
				if (targetValue == 0.0) {
					numberOfUnexpectedZeroes ++;
				} else {
					const double relativeError = fabs (checkedValue / targetValue - 1.0);
					if (relativeError > 10.0) Melder_casual
							(U"GPU matrix multiplication warning: unexpected imprecision of ", relativeError, U".");
				}
			}
		}
		if (numberOfUnexpectedZeroes > 0) Melder_casual
				(U"GPU matrix multiplication warning: found ", numberOfUnexpectedZeroes, U" unexpected zeroes, out of ", numberOfChecks, U".");
		return;
	}
#else
	mul_MAT_out (target, x, y);
#endif
}
682 
MATmul_forceOpenCL_(MATVU const & target,constMATVU const & x,constMATVU const & y)683 void MATmul_forceOpenCL_ (MATVU const& target, constMATVU const& x, constMATVU const& y) {
684 #ifdef macintosh
685 	static bool gpuInited = false;
686 	static cl_context openclContext = nullptr;
687 	static cl_command_queue commandQueue = nullptr;
688 	static cl_kernel kernel = nullptr;
689 	if (! gpuInited) {
690 		cl_platform_id platformIDs = nullptr;
691 		cl_uint numberOfPlatforms;
692 		cl_int status = clGetPlatformIDs (1, & platformIDs, & numberOfPlatforms);
693 		cl_device_id deviceIDs = nullptr;
694 		cl_uint numberOfDevices;
695 		status = clGetDeviceIDs (platformIDs, CL_DEVICE_TYPE_GPU, 1, & deviceIDs, & numberOfDevices);
696 		openclContext = clCreateContext (nullptr, 1, & deviceIDs, nullptr, nullptr, & status);
697 		commandQueue = clCreateCommandQueue (openclContext, deviceIDs, 0, & status);
698 		conststring8 sourceText = "__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {"
699 
700     /* Get the index of the current element to be processed */
701     	"int i = get_global_id(0);"
702 
703     /* Do the operation */
704     	"C[i] = A[i] + B[i];"
705 		"}";
706 		size_t sourceSize = strlen (sourceText);
707 		cl_program program = clCreateProgramWithSource (openclContext, 1,
708             (const char **) & sourceText, (const size_t *) & sourceSize, & status);
709 		status = clBuildProgram (program, 1, & deviceIDs, nullptr, nullptr, nullptr);
710 		kernel = clCreateKernel (program, "MATmul_opencl", & status);
711 	}
712 	#if 0
713 	// Create memory buffers on the device for each vector
714     cl_mem a_mem_obj = clCreateBuffer (context, CL_MEM_READ_ONLY,
715             LIST_SIZE * sizeof(int), NULL, &ret);
716     cl_mem b_mem_obj = clCreateBuffer (context, CL_MEM_READ_ONLY,
717             LIST_SIZE * sizeof(int), NULL, &ret);
718     cl_mem c_mem_obj = clCreateBuffer (context, CL_MEM_WRITE_ONLY,
719             LIST_SIZE * sizeof(int), NULL, &ret);
720 	// Copy the lists A and B to their respective memory buffers
721     ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
722             LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
723     ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
724             LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
725 	// Set the arguments of the kernel
726     ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
727     ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
728     ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
729 
730     // Execute the OpenCL kernel on the list
731     size_t global_item_size = LIST_SIZE; // Process the entire lists
732     size_t local_item_size = 64; // Divide work items into groups of 64
733     ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
734             &global_item_size, &local_item_size, 0, NULL, NULL);
735 
736     // Read the memory buffer C on the device to the local variable C
737     int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
738     ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
739             LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
740 
741     // Display the result to the screen
742     for(i = 0; i < LIST_SIZE; i++)
743         printf("%d + %d = %d\n", A[i], B[i], C[i]);
744     // Clean up
745     ret = clFlush(command_queue);
746     ret = clFinish(command_queue);
747    ret = clReleaseMemObject(a_mem_obj);
748     ret = clReleaseMemObject(b_mem_obj);
749     ret = clReleaseMemObject(c_mem_obj);
750     free(A);
751     free(B);
752     free(C);
753     #endif
754 #else
755 	mul_MAT_out (target, x, y);
756 #endif
757 }
758 
outer_MAT_out(MATVU const & target,constVECVU const & x,constVECVU const & y)759 void outer_MAT_out (MATVU const& target, constVECVU const& x, constVECVU const& y) {
760 	for (integer irow = 1; irow <= x.size; irow ++)
761 		for (integer icol = 1; icol <= y.size; icol ++)
762 			target [irow] [icol] = x [irow] * y [icol];
763 }
outer_MAT(constVECVU const & x,constVECVU const & y)764 autoMAT outer_MAT (constVECVU const& x, constVECVU const& y) {
765 	autoMAT result = raw_MAT (x.size, y.size);
766 	outer_MAT_out (result.get(), x, y);
767 	return result;
768 }
769 
autoMAT peaks_MAT (constVECVU const& x, bool includeEdges, int interpolate, bool sortByHeight) {
	/*
		Find the local maxima ("peaks") of x.
		The result is a 2-row matrix with one column per peak:
			row 1: the peak position (an index into x, possibly fractional if interpolated);
			row 2: the peak height.
		An interior peak at i satisfies x [i] > x [i - 1] && x [i] >= x [i + 1].
		If `includeEdges`, the first and last samples can also count as peaks.
		If `interpolate` is nonzero, position and height are refined by parabolic
		interpolation through the three samples around the peak.
		If `sortByHeight`, the columns are ordered from highest to lowest peak.
	*/
	if (x.size < 2) includeEdges = false;   // an edge peak needs a neighbour to compare with
	/*
		First pass: count the peaks, so that we can allocate the result exactly.
		The conditions here must match the filling pass below exactly
		(see the Melder_assert on peakNumber at the end).
	*/
	integer numberOfPeaks = 0;
	for (integer i = 2; i < x.size; i ++)
		if (x [i] > x [i - 1] && x [i] >= x [i + 1])
			numberOfPeaks ++;
	if (includeEdges) {
		if (x [1] > x [2])
			numberOfPeaks ++;
		if (x [x.size] > x [x.size - 1])
			numberOfPeaks ++;
	}
	autoMAT result = raw_MAT (2, numberOfPeaks);
	/*
		Second pass: fill in the peak positions and heights, left to right.
	*/
	integer peakNumber = 0;
	if (includeEdges && x [1] > x [2]) {
		result [1] [++ peakNumber] = 1;
		result [2] [peakNumber] = x [1];
	}
	for (integer i = 2; i < x.size; i ++) {
		if (x [i] > x [i - 1] && x [i] >= x [i + 1]) {
			++ peakNumber;
			if (interpolate != 0) {   // this is not a boolean; there could follow more options
				/*
					Parabolic interpolation.
					dy is the slope estimate at i; d2y is (minus) the second difference.
					The peak condition guarantees d2y > 0 (first term positive, second nonnegative),
					so the division is safe and the vertex lies within (i - 1, i + 1).
				*/
				const double dy = 0.5 * (x [i + 1] - x [i - 1]);
				const double d2y = (x [i] - x [i - 1]) + (x [i] - x [i + 1]);
				Melder_assert (d2y > 0.0);
				result [1] [peakNumber] = (double) i + dy / d2y;   // vertex position of the parabola
				result [2] [peakNumber] = x [i] + 0.5 * dy * (dy / d2y);   // vertex height of the parabola
			} else {
				/*
					Don't interpolate: choose the nearest index.
				*/
				result [1] [peakNumber] = i;
				result [2] [peakNumber] = x [i];
			}
		}
	}
	if (includeEdges && x [x.size] > x [x.size - 1]) {
		result [1] [++ peakNumber] = x.size;
		result [2] [peakNumber] = x [x.size];
	}
	Melder_assert (peakNumber == numberOfPeaks);   // the two passes must have agreed
	if (sortByHeight) {
		/*
			NUMsortTogether sorts ascending, so negate the heights to get
			a descending sort, dragging the positions (row 1) along.
		*/
		for (integer i = 1; i <= numberOfPeaks; i ++)
			result [2] [i] *= -1.0;
		NUMsortTogether (result.row (2), result.row (1));

		for (integer i = 1; i <= numberOfPeaks; i ++)
			result [2] [i] *= -1.0;
	}
	return result;
}
824 
power_MAT_out(MATVU const & target,constMATVU const & mat,double power)825 void power_MAT_out (MATVU const& target, constMATVU const& mat, double power) {
826 	for (integer irow = 1; irow <= target.nrow; irow ++)
827 		power_VEC_out (target.row (irow), mat.row (irow), power);
828 }
829 
830 /* End of file MAT.cpp */
831