/**********************************************************************************************/
/* This program is part of the Barcelona OpenMP Tasks Suite */
/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */
/* Copyright (C) 2009 Universitat Politecnica de Catalunya */
/*  */
/* This program is free software; you can redistribute it and/or modify */
/* it under the terms of the GNU General Public License as published by */
/* the Free Software Foundation; either version 2 of the License, or */
/* (at your option) any later version. */
/*  */
/* This program is distributed in the hope that it will be useful, */
/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
/* GNU General Public License for more details. */
/*  */
/* You should have received a copy of the GNU General Public License */
/* along with this program; if not, write to the Free Software */
/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */
/**********************************************************************************************/
#pragma once

#include <cassert>
#include <chrono>
#include <cstdlib>   // malloc/free, rand/srand
#include <fstream>
#include <iostream>
#include <random>
#include <string>

typedef double REAL;
typedef unsigned long PTR;

/* MATRIX_SIZE must be a power of 2 and a multiple of 16 */
#define MATRIX_SIZE 1024

/* Below this cut-off, Strassen uses the MultiplyByDivideAndConquer() algorithm */
//#define CUTOFF_SIZE 2*1024
#define CUTOFF_SIZE 64

inline REAL *MatrixA, *MatrixB, *MatrixC;


/* ******************************************************************* */
/* STRASSEN APPLICATION CUT-OFFs */
/* ******************************************************************* */
/* Strassen uses three different functions to compute Matrix Multiply. */
/* Each of them is related to an application cut-off value: */
/*  - Initial algorithm: OptimizedStrassenMultiply_seq()/_par() */
/*  - CUTOFF_SIZE: MultiplyByDivideAndConquer() */
/*  - SizeAtWhichNaiveAlgorithmIsMoreEfficient: FastAdditiveNaiveMatrixMultiply() */
/* ******************************************************************* */

/* FIXME: at the moment we use a constant value, change to a parameter??? */
/* Below this cut-off, Strassen uses the FastAdditiveNaiveMatrixMultiply algorithm */
#define SizeAtWhichNaiveAlgorithmIsMoreEfficient 16
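
/* Resulting kernel dispatch for the default MATRIX_SIZE of 1024 (derived from
 * the checks in OptimizedStrassenMultiply_seq and MultiplyByDivideAndConquer below):
 * the Strassen recursion splits 1024 -> 512 -> 256 -> 128 -> 64; once
 * MatrixSize <= CUTOFF_SIZE (64) the block is handed to MultiplyByDivideAndConquer,
 * which recurses until QuadrantSize <= 16 and then calls the
 * FastNaiveMatrixMultiply / FastAdditiveNaiveMatrixMultiply kernels on 16x16
 * quadrants (hence the requirement that sizes be multiples of 16).
 */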

/***********************************************************************
 * maximum tolerable relative error (for the checking routine)
 **********************************************************************/
#define EPSILON (1.0E-6)
/***********************************************************************
 * Matrices are stored in row-major order; A is a pointer to
 * the first element of the matrix, and an is the number of elements
 * between two rows. This macro produces the element A[i,j]
 * given A, an, i and j
 **********************************************************************/
#define ELEM(A, an, i, j) (A[(i)*(an)+(j)])
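/* For example, with a row stride of 1024, ELEM(A, 1024, 2, 3) expands to
 * A[2*1024+3], i.e. the element in row 2, column 3. */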

void FastNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
                             unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB);
void FastAdditiveNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
                                     unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB);
void MultiplyByDivideAndConquer(REAL *C, REAL *A, REAL *B,
                                unsigned MatrixSize,
                                unsigned RowWidthC,
                                unsigned RowWidthA,
                                unsigned RowWidthB,
                                int AdditiveMode
                               );
void OptimizedStrassenMultiply_par(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
                                   unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth);
REAL *alloc_matrix(int n);




/************************************************************************
 * Set an n by n matrix A to random values. The distance between
 * rows is an.
 ************************************************************************/
inline void init_matrix(int n, REAL *A, int an) {
  int i, j;

  for (i = 0; i < n; ++i)
    for (j = 0; j < n; ++j)
      ELEM(A, an, i, j) = ((double) rand()) / (double) RAND_MAX;
}

/************************************************************************
 * Compare two matrices. Print an error message if they differ by
 * more than EPSILON.
 ************************************************************************/
inline int compare_matrix(int n, REAL *A, int an, REAL *B, int bn) {
  int i, j;
  REAL c;

  for (i = 0; i < n; ++i)
    for (j = 0; j < n; ++j) {
      /* compute the relative error c */
      c = ELEM(A, an, i, j) - ELEM(B, bn, i, j);
      if (c < 0.0)
        c = -c;

      c = c / ELEM(A, an, i, j);
      if (c > EPSILON) {
        std::cerr << "Strassen: Wrong answer! : " << c << '/' << EPSILON << std::endl;
        // FAIL
        return 0;
      }
    }

  // SUCCESS
  return 1;
}



/***********************************************************************
 * Allocate a matrix of side n (therefore n^2 elements)
 **********************************************************************/
inline REAL *alloc_matrix(int n) {
  return static_cast<REAL*>(malloc(n * n * sizeof(REAL)));
}


/***********************************************************************
 * Naive sequential algorithm, for comparison purposes
 **********************************************************************/
inline void matrixmul(int n, REAL *A, int an, REAL *B, int bn, REAL *C, int cn) {
  int i, j, k;
  REAL s;

  for (i = 0; i < n; ++i)
  {
    for (j = 0; j < n; ++j)
    {
      s = 0.0;
      for (k = 0; k < n; ++k) s += ELEM(A, an, i, k) * ELEM(B, bn, k, j);
      ELEM(C, cn, i, j) = s;
    }
  }
}


/***********************************************************************
 * Sequential (recursive) Strassen algorithm, used as the serial baseline
 **********************************************************************/
inline void OptimizedStrassenMultiply_seq(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
                                          unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth)
{
  unsigned QuadrantSize = MatrixSize >> 1;    /* MatrixSize / 2 */
  unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * QuadrantSize
                                 + 32;
  unsigned Column, Row;

  /************************************************************************
  ** For each matrix A, B, and C, we'll want pointers to each quadrant
  ** in the matrix. These quadrants will be addressed as follows:
  **  --        --
  **  | A11  A12 |
  **  |          |
  **  | A21  A22 |
  **  --        --
  ************************************************************************/
  REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12,
       *A21, *B21, *C21, *A22, *B22, *C22;

  REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT;
#define T2sMULT C22
#define NumberOfVariables 11
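
  /* The recursion below follows the Winograd variant of Strassen's scheme:
   * only seven quadrant products are taken,
   *   M2  = A11*B11                M5  = S1*S5
   *   T1  = S2*S6 + M2             T2  = S3*S7 + T1
   *   C11 = M2 + A12*B21           C12 = S4*B22 + T1 + M5
   *   C21 = T2 - A22*S8            C22 = M5 + T2
   * where S1..S8 are the sums/differences formed in the first loop nest below.
   */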

  PTR TempMatrixOffset = 0;
  PTR MatrixOffsetA = 0;
  PTR MatrixOffsetB = 0;

  char *Heap;
  void *StartHeap;

  /* Distance, in bytes, between the end of a matrix row and the start of the next row */
  PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3;
  PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3;
  PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3;

  if (MatrixSize <= CUTOFF_SIZE) {
    MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0);
    return;
  }

  ///* Initialize quadrant matrices */
  //#define A11 A
  //#define B11 B
  //#define C11 C

  auto A11 = A;
  auto B11 = B;
  auto C11 = C;

  A12 = A11 + QuadrantSize;
  B12 = B11 + QuadrantSize;
  C12 = C11 + QuadrantSize;
  A21 = A + (RowWidthA * QuadrantSize);
  B21 = B + (RowWidthB * QuadrantSize);
  C21 = C + (RowWidthC * QuadrantSize);
  A22 = A21 + QuadrantSize;
  B22 = B21 + QuadrantSize;
  C22 = C21 + QuadrantSize;

  /* Allocate Heap Space Here */
  StartHeap = Heap = static_cast<char*>(malloc(QuadrantSizeInBytes * NumberOfVariables));
  /* ensure that heap is on cache boundary */
  if ( ((PTR) Heap) & 31)
    Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) );

  /* Distribute the heap space over the variables */
  S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes;

  /***************************************************************************
  ** Step through all columns row by row (vertically)
  ** (jumps in memory by RowWidth => bad locality)
  ** (but we want the best locality on the innermost loop)
  ***************************************************************************/
  for (Row = 0; Row < QuadrantSize; Row++) {

    /*************************************************************************
    ** Step through each row horizontally (addressing elements in each column)
    ** (jumps linearly through memory => good locality)
    *************************************************************************/
    for (Column = 0; Column < QuadrantSize; Column++) {

      /***********************************************************
      ** Within this loop, the following holds for MatrixOffset:
      ** MatrixOffset = ((Row * RowWidth) + Column) * sizeof(REAL)
      ** (note: the offsets below are kept in bytes)
      ***********************************************************/
      /* Element of Global Matrix, such as A, B, C */
      #define E(Matrix)  (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) )
      #define EA(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) )
      #define EB(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) )

      /* FIXME - may pay to expand these out - got higher speed-ups below */
      /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */
      E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) );

      /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */
      E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21);

      /* S3 = A11 - A21 */
      E(S3) = EA(A11) - EA(A21);

      /* S7 = B22 - B12 */
      E(S7) = EB(B22) - EB(B12);

      TempMatrixOffset += sizeof(REAL);
      MatrixOffsetA += sizeof(REAL);
      MatrixOffsetB += sizeof(REAL);
    } /* end column loop */

    MatrixOffsetA += RowIncrementA;
    MatrixOffsetB += RowIncrementB;
  } /* end row loop */

  /* M2 = A11 x B11 */
  OptimizedStrassenMultiply_seq(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1);

  /* M5 = S1 * S5 */
  OptimizedStrassenMultiply_seq(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1);

  /* Step 1 of T1 = S2 x S6 + M2 */
  OptimizedStrassenMultiply_seq(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1);

  /* Step 1 of T2 = T1 + S3 x S7 */
  OptimizedStrassenMultiply_seq(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1);

  /* Step 1 of C11 = M2 + A12 * B21 */
  OptimizedStrassenMultiply_seq(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1);

  /* Step 1 of C12 = S4 x B22 + T1 + M5 */
  OptimizedStrassenMultiply_seq(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1);

  /* Step 1 of C21 = T2 - A22 * S8 */
  OptimizedStrassenMultiply_seq(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1);

  /***************************************************************************
  ** Step through all columns row by row (vertically)
  ** (jumps in memory by RowWidth => bad locality)
  ** (but we want the best locality on the innermost loop)
  ***************************************************************************/
  for (Row = 0; Row < QuadrantSize; Row++) {
    /*************************************************************************
    ** Step through each row horizontally (addressing elements in each column)
    ** (jumps linearly through memory => good locality)
    *************************************************************************/
    for (Column = 0; Column < QuadrantSize; Column += 4) {
      REAL LocalM5_0 = *(M5);
      REAL LocalM5_1 = *(M5+1);
      REAL LocalM5_2 = *(M5+2);
      REAL LocalM5_3 = *(M5+3);
      REAL LocalM2_0 = *(M2);
      REAL LocalM2_1 = *(M2+1);
      REAL LocalM2_2 = *(M2+2);
      REAL LocalM2_3 = *(M2+3);
      REAL T1_0 = *(T1sMULT) + LocalM2_0;
      REAL T1_1 = *(T1sMULT+1) + LocalM2_1;
      REAL T1_2 = *(T1sMULT+2) + LocalM2_2;
      REAL T1_3 = *(T1sMULT+3) + LocalM2_3;
      REAL T2_0 = *(C22) + T1_0;
      REAL T2_1 = *(C22+1) + T1_1;
      REAL T2_2 = *(C22+2) + T1_2;
      REAL T2_3 = *(C22+3) + T1_3;
      (*(C11))   += LocalM2_0;
      (*(C11+1)) += LocalM2_1;
      (*(C11+2)) += LocalM2_2;
      (*(C11+3)) += LocalM2_3;
      (*(C12))   += LocalM5_0 + T1_0;
      (*(C12+1)) += LocalM5_1 + T1_1;
      (*(C12+2)) += LocalM5_2 + T1_2;
      (*(C12+3)) += LocalM5_3 + T1_3;
      (*(C22))   = LocalM5_0 + T2_0;
      (*(C22+1)) = LocalM5_1 + T2_1;
      (*(C22+2)) = LocalM5_2 + T2_2;
      (*(C22+3)) = LocalM5_3 + T2_3;
      (*(C21))   = (- *(C21))   + T2_0;
      (*(C21+1)) = (- *(C21+1)) + T2_1;
      (*(C21+2)) = (- *(C21+2)) + T2_2;
      (*(C21+3)) = (- *(C21+3)) + T2_3;
      M5 += 4;
      M2 += 4;
      T1sMULT += 4;
      C11 += 4;
      C12 += 4;
      C21 += 4;
      C22 += 4;
    }
    C11 = (REAL*) ( ((PTR) C11) + RowIncrementC);
    C12 = (REAL*) ( ((PTR) C12) + RowIncrementC);
    C21 = (REAL*) ( ((PTR) C21) + RowIncrementC);
    C22 = (REAL*) ( ((PTR) C22) + RowIncrementC);
  }
  free(StartHeap);
}




/*****************************************************************************
**
** FastNaiveMatrixMultiply
**
** For small to medium sized matrices A, B, and C of size
** MatrixSize * MatrixSize this function performs the operation
** C = A x B efficiently.
**
** Note MatrixSize must be divisible by 8.
**
** INPUT:
**    C = (*C WRITE) Address of top left element of matrix C.
**    A = (*A IS READ ONLY) Address of top left element of matrix A.
**    B = (*B IS READ ONLY) Address of top left element of matrix B.
**    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
**    RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
**    RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
**    RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
**
** OUTPUT:
**    C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.)
**
*****************************************************************************/
inline void FastNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
                                    unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB) {
  /* Assumes size of real is 8 bytes */
  PTR RowWidthBInBytes = RowWidthB << 3;
  PTR RowWidthAInBytes = RowWidthA << 3;
  PTR MatrixWidthInBytes = MatrixSize << 3;
  PTR RowIncrementC = ( RowWidthC - MatrixSize) << 3;
  unsigned Horizontal, Vertical;

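  /* The inner kernel computes a 1x8 strip of C at a time: eight running sums
   * are kept in local variables (intended to stay in registers) while each
   * element of the current A row is multiplied against eight consecutive
   * elements of a B row, so B is swept row by row with unit stride.
   */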
  REAL *ARowStart = A;
  for (Vertical = 0; Vertical < MatrixSize; Vertical++) {
    for (Horizontal = 0; Horizontal < MatrixSize; Horizontal += 8) {
      REAL *BColumnStart = B + Horizontal;
      REAL FirstARowValue = *ARowStart++;

      REAL Sum0 = FirstARowValue * (*BColumnStart);
      REAL Sum1 = FirstARowValue * (*(BColumnStart+1));
      REAL Sum2 = FirstARowValue * (*(BColumnStart+2));
      REAL Sum3 = FirstARowValue * (*(BColumnStart+3));
      REAL Sum4 = FirstARowValue * (*(BColumnStart+4));
      REAL Sum5 = FirstARowValue * (*(BColumnStart+5));
      REAL Sum6 = FirstARowValue * (*(BColumnStart+6));
      REAL Sum7 = FirstARowValue * (*(BColumnStart+7));

      unsigned Products;
      for (Products = 1; Products < MatrixSize; Products++) {
        REAL ARowValue = *ARowStart++;
        BColumnStart = (REAL*) (((PTR) BColumnStart) + RowWidthBInBytes);
        Sum0 += ARowValue * (*BColumnStart);
        Sum1 += ARowValue * (*(BColumnStart+1));
        Sum2 += ARowValue * (*(BColumnStart+2));
        Sum3 += ARowValue * (*(BColumnStart+3));
        Sum4 += ARowValue * (*(BColumnStart+4));
        Sum5 += ARowValue * (*(BColumnStart+5));
        Sum6 += ARowValue * (*(BColumnStart+6));
        Sum7 += ARowValue * (*(BColumnStart+7));
      }
      ARowStart = (REAL*) ( ((PTR) ARowStart) - MatrixWidthInBytes);

      *(C)   = Sum0;
      *(C+1) = Sum1;
      *(C+2) = Sum2;
      *(C+3) = Sum3;
      *(C+4) = Sum4;
      *(C+5) = Sum5;
      *(C+6) = Sum6;
      *(C+7) = Sum7;
      C += 8;
    }
    ARowStart = (REAL*) ( ((PTR) ARowStart) + RowWidthAInBytes );
    C = (REAL*) ( ((PTR) C) + RowIncrementC );
  }
}


/*****************************************************************************
**
** FastAdditiveNaiveMatrixMultiply
**
** For small to medium sized matrices A, B, and C of size
** MatrixSize * MatrixSize this function performs the operation
** C += A x B efficiently.
**
** Note MatrixSize must be divisible by 8.
**
** INPUT:
**    C = (*C READ/WRITE) Address of top left element of matrix C.
**    A = (*A IS READ ONLY) Address of top left element of matrix A.
**    B = (*B IS READ ONLY) Address of top left element of matrix B.
**    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
**    RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
**    RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
**    RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
**
** OUTPUT:
**    C = (*C READ/WRITE) Matrix C contains C + A x B.
**
*****************************************************************************/
inline void FastAdditiveNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
                                            unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB) {
  /* Assumes size of real is 8 bytes */
  PTR RowWidthBInBytes = RowWidthB << 3;
  PTR RowWidthAInBytes = RowWidthA << 3;
  PTR MatrixWidthInBytes = MatrixSize << 3;
  PTR RowIncrementC = ( RowWidthC - MatrixSize) << 3;
  unsigned Horizontal, Vertical;

  REAL *ARowStart = A;
  for (Vertical = 0; Vertical < MatrixSize; Vertical++) {
    for (Horizontal = 0; Horizontal < MatrixSize; Horizontal += 8) {
      REAL *BColumnStart = B + Horizontal;

      REAL Sum0 = *C;
      REAL Sum1 = *(C+1);
      REAL Sum2 = *(C+2);
      REAL Sum3 = *(C+3);
      REAL Sum4 = *(C+4);
      REAL Sum5 = *(C+5);
      REAL Sum6 = *(C+6);
      REAL Sum7 = *(C+7);

      unsigned Products;
      for (Products = 0; Products < MatrixSize; Products++) {
        REAL ARowValue = *ARowStart++;

        Sum0 += ARowValue * (*BColumnStart);
        Sum1 += ARowValue * (*(BColumnStart+1));
        Sum2 += ARowValue * (*(BColumnStart+2));
        Sum3 += ARowValue * (*(BColumnStart+3));
        Sum4 += ARowValue * (*(BColumnStart+4));
        Sum5 += ARowValue * (*(BColumnStart+5));
        Sum6 += ARowValue * (*(BColumnStart+6));
        Sum7 += ARowValue * (*(BColumnStart+7));

        BColumnStart = (REAL*) (((PTR) BColumnStart) + RowWidthBInBytes);

      }
      ARowStart = (REAL*) ( ((PTR) ARowStart) - MatrixWidthInBytes);

      *(C)   = Sum0;
      *(C+1) = Sum1;
      *(C+2) = Sum2;
      *(C+3) = Sum3;
      *(C+4) = Sum4;
      *(C+5) = Sum5;
      *(C+6) = Sum6;
      *(C+7) = Sum7;
      C += 8;
    }

    ARowStart = (REAL*) ( ((PTR) ARowStart) + RowWidthAInBytes );
    C = (REAL*) ( ((PTR) C) + RowIncrementC );
  }
}



/*****************************************************************************
**
** MultiplyByDivideAndConquer
**
** For medium to medium-large (would you like fries with that) sized
** matrices A, B, and C of size MatrixSize * MatrixSize this function
** efficiently performs the operation
**    C  = A x B (if AdditiveMode == 0)
**    C += A x B (if AdditiveMode != 0)
**
** Note MatrixSize must be divisible by 16.
**
** INPUT:
**    C = (*C READ/WRITE) Address of top left element of matrix C.
**    A = (*A IS READ ONLY) Address of top left element of matrix A.
**    B = (*B IS READ ONLY) Address of top left element of matrix B.
**    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
**    RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
**    RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
**    RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
**    AdditiveMode = 0 if we want C = A x B, otherwise we'll do C += A x B
**
** OUTPUT:
**    C (+)= A x B. (+ if AdditiveMode != 0)
**
*****************************************************************************/
inline void MultiplyByDivideAndConquer(REAL *C, REAL *A, REAL *B,
                                       unsigned MatrixSize,
                                       unsigned RowWidthC,
                                       unsigned RowWidthA,
                                       unsigned RowWidthB,
                                       int AdditiveMode
                                      )
{
#define A00 A
#define B00 B
#define C00 C
  REAL *A01, *A10, *A11, *B01, *B10, *B11, *C01, *C10, *C11;
  unsigned QuadrantSize = MatrixSize >> 1;

  /* partition the matrix */
  A01 = A00 + QuadrantSize;
  A10 = A00 + RowWidthA * QuadrantSize;
  A11 = A10 + QuadrantSize;

  B01 = B00 + QuadrantSize;
  B10 = B00 + RowWidthB * QuadrantSize;
  B11 = B10 + QuadrantSize;

  C01 = C00 + QuadrantSize;
  C10 = C00 + RowWidthC * QuadrantSize;
  C11 = C10 + QuadrantSize;
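
  /* Standard 2x2 block decomposition, eight quadrant products in total:
   *   C00 = A00*B00 + A01*B10      C01 = A00*B01 + A01*B11
   *   C10 = A10*B00 + A11*B10      C11 = A10*B01 + A11*B11
   * Each quadrant is computed by a first call in the caller's AdditiveMode
   * followed by a second, additive call (AdditiveMode = 1).
   */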

  if (QuadrantSize > SizeAtWhichNaiveAlgorithmIsMoreEfficient) {

    MultiplyByDivideAndConquer(C00, A00, B00, QuadrantSize,
                               RowWidthC, RowWidthA, RowWidthB,
                               AdditiveMode);

    MultiplyByDivideAndConquer(C01, A00, B01, QuadrantSize,
                               RowWidthC, RowWidthA, RowWidthB,
                               AdditiveMode);

    MultiplyByDivideAndConquer(C11, A10, B01, QuadrantSize,
                               RowWidthC, RowWidthA, RowWidthB,
                               AdditiveMode);

    MultiplyByDivideAndConquer(C10, A10, B00, QuadrantSize,
                               RowWidthC, RowWidthA, RowWidthB,
                               AdditiveMode);

    MultiplyByDivideAndConquer(C00, A01, B10, QuadrantSize,
                               RowWidthC, RowWidthA, RowWidthB,
                               1);

    MultiplyByDivideAndConquer(C01, A01, B11, QuadrantSize,
                               RowWidthC, RowWidthA, RowWidthB,
                               1);

    MultiplyByDivideAndConquer(C11, A11, B11, QuadrantSize,
                               RowWidthC, RowWidthA, RowWidthB,
                               1);

    MultiplyByDivideAndConquer(C10, A11, B10, QuadrantSize,
                               RowWidthC, RowWidthA, RowWidthB,
                               1);

  } else {

    if (AdditiveMode) {
      FastAdditiveNaiveMatrixMultiply(C00, A00, B00, QuadrantSize,
                                      RowWidthC, RowWidthA, RowWidthB);

      FastAdditiveNaiveMatrixMultiply(C01, A00, B01, QuadrantSize,
                                      RowWidthC, RowWidthA, RowWidthB);

      FastAdditiveNaiveMatrixMultiply(C11, A10, B01, QuadrantSize,
                                      RowWidthC, RowWidthA, RowWidthB);

      FastAdditiveNaiveMatrixMultiply(C10, A10, B00, QuadrantSize,
                                      RowWidthC, RowWidthA, RowWidthB);

    } else {

      FastNaiveMatrixMultiply(C00, A00, B00, QuadrantSize,
                              RowWidthC, RowWidthA, RowWidthB);

      FastNaiveMatrixMultiply(C01, A00, B01, QuadrantSize,
                              RowWidthC, RowWidthA, RowWidthB);

      FastNaiveMatrixMultiply(C11, A10, B01, QuadrantSize,
                              RowWidthC, RowWidthA, RowWidthB);

      FastNaiveMatrixMultiply(C10, A10, B00, QuadrantSize,
                              RowWidthC, RowWidthA, RowWidthB);
    }

    FastAdditiveNaiveMatrixMultiply(C00, A01, B10, QuadrantSize,
                                    RowWidthC, RowWidthA, RowWidthB);

    FastAdditiveNaiveMatrixMultiply(C01, A01, B11, QuadrantSize,
                                    RowWidthC, RowWidthA, RowWidthB);

    FastAdditiveNaiveMatrixMultiply(C11, A11, B11, QuadrantSize,
                                    RowWidthC, RowWidthA, RowWidthB);

    FastAdditiveNaiveMatrixMultiply(C10, A11, B10, QuadrantSize,
                                    RowWidthC, RowWidthA, RowWidthB);
  }
}

/***********************************************************************
 * Write the first sz elements of matrix C to file fname, space separated
 **********************************************************************/
inline void dump_matrix(REAL *C, const unsigned sz, std::string&& fname) {
  std::ofstream ofs {fname};

  for (unsigned i = 0; i < sz; i++) {
    ofs << C[i] << ' ';
  }
}

/***********************************************************************
 * Allocate the three global MATRIX_SIZE x MATRIX_SIZE matrices and fill
 * A and B with reproducible random values (fixed seed)
 **********************************************************************/
inline void init_ABC() {
  MatrixA = alloc_matrix(MATRIX_SIZE);
  MatrixB = alloc_matrix(MATRIX_SIZE);
  MatrixC = alloc_matrix(MATRIX_SIZE);

  ::srand(1);
  init_matrix(MATRIX_SIZE, MatrixA, MATRIX_SIZE);
  init_matrix(MATRIX_SIZE, MatrixB, MATRIX_SIZE);
}
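
/* Example driver (a sketch only; the hypothetical Ref buffer is not part of
 * this header, and the measure_time_* benchmarks below are defined elsewhere):
 *
 *   init_ABC();
 *   OptimizedStrassenMultiply_seq(MatrixC, MatrixA, MatrixB,
 *                                 MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, 0);
 *   REAL *Ref = alloc_matrix(MATRIX_SIZE);
 *   matrixmul(MATRIX_SIZE, MatrixA, MATRIX_SIZE, MatrixB, MATRIX_SIZE, Ref, MATRIX_SIZE);
 *   assert(compare_matrix(MATRIX_SIZE, Ref, MATRIX_SIZE, MatrixC, MATRIX_SIZE));
 *   free(Ref);
 */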

std::chrono::microseconds measure_time_omp(unsigned, REAL *, REAL *, REAL *, int);
std::chrono::microseconds measure_time_tbb(unsigned, REAL *, REAL *, REAL *, int);
std::chrono::microseconds measure_time_taskflow(unsigned, REAL *, REAL *, REAL *, int);
