1 /**********************************************************************************************/
2 /*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
3 /*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
4 /*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
5 /*                                                                                            */
6 /*  This program is free software; you can redistribute it and/or modify                      */
7 /*  it under the terms of the GNU General Public License as published by                      */
8 /*  the Free Software Foundation; either version 2 of the License, or                         */
9 /*  (at your option) any later version.                                                       */
10 /*                                                                                            */
11 /*  This program is distributed in the hope that it will be useful,                           */
12 /*  but WITHOUT ANY WARRANTY; without even the implied warranty of                            */
13 /*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             */
14 /*  GNU General Public License for more details.                                              */
15 /*                                                                                            */
16 /*  You should have received a copy of the GNU General Public License                         */
17 /*  along with this program; if not, write to the Free Software                               */
18 /*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA            */
19 /**********************************************************************************************/
#include <cassert>
#include <chrono>
#include <cmath>    // std::fabs
#include <cstdlib>  // std::malloc/free, rand/srand, size_t
#include <fstream>
#include <iostream>
#include <random>
#include <string>   // std::string (dump_matrix)
25 
26 #pragma once
27 
28 typedef double REAL;
29 typedef unsigned long PTR;
30 
31 /* MATRIX_SIZE must be a power of 2 and a multiple of 16 */
32 #define MATRIX_SIZE 1024
33 
34 /* Below this cut off strassen uses MultiplyByDivideAndConquer() algorithm */
35 //#define CUTOFF_SIZE 2*1024
36 #define CUTOFF_SIZE 64
37 
38 inline double *MatrixA, *MatrixB, *MatrixC;
39 
40 
41 /* ******************************************************************* */
42 /* STRASSEN APPLICATION CUT OFF's                                      */
43 /* ******************************************************************* */
44 /* Strassen uses three different functions to compute Matrix Multiply. */
45 /* Each of them is related to an application cut off value:            */
46 /*  - Initial algorithm: OptimizedStrassenMultiply()                   */
47 /*  - bots_app_cutoff_value: MultiplyByDivideAndConquer()              */
48 /*  - SizeAtWhichNaiveAlgorithmIsMoreEfficient: FastAdditiveNaiveMatrixMultiply() */
49 /* ******************************************************************* */
50 
51 /*FIXME: at the moment we use a constant value, change to parameter ???*/
52 /* Below this cut off  strassen uses FastAdditiveNaiveMatrixMultiply algorithm */
53 #define SizeAtWhichNaiveAlgorithmIsMoreEfficient 16
54 
55 /***********************************************************************
56  * maximum tolerable relative error (for the checking routine)
57  **********************************************************************/
58 #define EPSILON (1.0E-6)
59 /***********************************************************************
60  * Matrices are stored in row-major order; A is a pointer to
61  * the first element of the matrix, and an is the number of elements
62  * between two rows. This macro produces the element A[i,j]
63  * given A, an, i and j
64  **********************************************************************/
65 #define ELEM(A, an, i, j) (A[(i)*(an)+(j)])
66 
67 void FastNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
68      unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB);
69 void FastAdditiveNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
70      unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB);
71 void MultiplyByDivideAndConquer(REAL *C, REAL *A, REAL *B,
72              unsigned MatrixSize,
73              unsigned RowWidthC,
74              unsigned RowWidthA,
75              unsigned RowWidthB,
76              int AdditiveMode
77             );
78 void OptimizedStrassenMultiply_par(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
79      unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth);
80 REAL *alloc_matrix(int n);
81 
82 
83 
84 
85 
86 /************************************************************************
87  * Set an n by n matrix A to random values.  The distance between
88  * rows is an
89  ************************************************************************/
init_matrix(int n,REAL * A,int an)90 inline void init_matrix(int n, REAL *A, int an) {
91   int i, j;
92 
93   for (i = 0; i < n; ++i)
94     for (j = 0; j < n; ++j)
95       ELEM(A, an, i, j) = ((double) rand()) / (double) RAND_MAX;
96 }
97 
98 /************************************************************************
99  * Compare two matrices.  Print an error message if they differ by
100  * more than EPSILON.
101  ************************************************************************/
compare_matrix(int n,REAL * A,int an,REAL * B,int bn)102 inline int compare_matrix(int n, REAL *A, int an, REAL *B, int bn) {
103   int i, j;
104   REAL c;
105 
106   for (i = 0; i < n; ++i)
107     for (j = 0; j < n; ++j) {
108       /* compute the relative error c */
109       c = ELEM(A, an, i, j) - ELEM(B, bn, i, j);
110       if (c < 0.0)
111         c = -c;
112 
113       c = c / ELEM(A, an, i, j);
114       if (c > EPSILON) {
115         std::cerr << "Strassen: Wrong answer! : " << c << '/' << EPSILON << std::endl;
116         // FAIL
117         return 0;
118       }
119     }
120 
121   // SUCCESS
122   return 1;
123 }
124 
125 
126 
127 /***********************************************************************
128  * Allocate a matrix of side n (therefore n^2 elements)
129  **********************************************************************/
alloc_matrix(int n)130 inline REAL *alloc_matrix(int n) {
131   return static_cast<REAL*>(malloc(n * n * sizeof(REAL)));
132 }
133 
134 
135 /***********************************************************************
136  * Naive sequential algorithm, for comparison purposes
137  **********************************************************************/
matrixmul(int n,REAL * A,int an,REAL * B,int bn,REAL * C,int cn)138 inline void matrixmul(int n, REAL *A, int an, REAL *B, int bn, REAL *C, int cn) {
139   int i, j, k;
140   REAL s;
141 
142   for (i = 0; i < n; ++i)
143   {
144     for (j = 0; j < n; ++j)
145     {
146       s = 0.0;
147       for (k = 0; k < n; ++k) s += ELEM(A, an, i, k) * ELEM(B, bn, k, j);
148       ELEM(C, cn, i, j) = s;
149     }
150   }
151 }
152 
153 
154 /***********************************************************************
155  * Naive sequential strassen algorithm, for comparison purposes
156  **********************************************************************/
OptimizedStrassenMultiply_seq(REAL * C,REAL * A,REAL * B,unsigned MatrixSize,unsigned RowWidthC,unsigned RowWidthA,unsigned RowWidthB,int Depth)157 inline void OptimizedStrassenMultiply_seq(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
158      unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth)
159 {
160   unsigned QuadrantSize = MatrixSize >> 1; /* MatixSize / 2 */
161   unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * QuadrantSize
162                                  + 32;
163   unsigned Column, Row;
164 
165   /************************************************************************
166   ** For each matrix A, B, and C, we'll want pointers to each quandrant
167   ** in the matrix. These quandrants will be addressed as follows:
168   **  --        --
169   **  | A11  A12 |
170   **  |          |
171   **  | A21  A22 |
172   **  --        --
173   ************************************************************************/
174   REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12,
175        *A21, *B21, *C21, *A22, *B22, *C22;
176 
177   REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT;
178   #define T2sMULT C22
179   #define NumberOfVariables 11
180 
181   PTR TempMatrixOffset = 0;
182   PTR MatrixOffsetA = 0;
183   PTR MatrixOffsetB = 0;
184 
185   char *Heap;
186   void *StartHeap;
187 
188   /* Distance between the end of a matrix row and the start of the next row */
189   PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3;
190   PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3;
191   PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3;
192 
193   if (MatrixSize <= CUTOFF_SIZE) {
194     MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0);
195     return;
196   }
197 
198   ///* Initialize quandrant matrices */
199   //#define A11 A
200   //#define B11 B
201   //#define C11 C
202 
203   auto A11 = A;
204   auto B11 = B;
205   auto C11 = C;
206 
207   A12 = A11 + QuadrantSize;
208   B12 = B11 + QuadrantSize;
209   C12 = C11 + QuadrantSize;
210   A21 = A + (RowWidthA * QuadrantSize);
211   B21 = B + (RowWidthB * QuadrantSize);
212   C21 = C + (RowWidthC * QuadrantSize);
213   A22 = A21 + QuadrantSize;
214   B22 = B21 + QuadrantSize;
215   C22 = C21 + QuadrantSize;
216 
217   /* Allocate Heap Space Here */
218   StartHeap = Heap = static_cast<char*>(malloc(QuadrantSizeInBytes * NumberOfVariables));
219   /* ensure that heap is on cache boundary */
220   if ( ((PTR) Heap) & 31)
221      Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) );
222 
223   /* Distribute the heap space over the variables */
224   S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
225   S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
226   S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
227   S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
228   S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
229   S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
230   S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
231   S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
232   M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
233   M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
234   T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes;
235 
236   /***************************************************************************
237   ** Step through all columns row by row (vertically)
238   ** (jumps in memory by RowWidth => bad locality)
239   ** (but we want the best locality on the innermost loop)
240   ***************************************************************************/
241   for (Row = 0; Row < QuadrantSize; Row++) {
242 
243     /*************************************************************************
244     ** Step through each row horizontally (addressing elements in each column)
245     ** (jumps linearly througn memory => good locality)
246     *************************************************************************/
247     for (Column = 0; Column < QuadrantSize; Column++) {
248 
249       /***********************************************************
250       ** Within this loop, the following holds for MatrixOffset:
251       ** MatrixOffset = (Row * RowWidth) + Column
252       ** (note: that the unit of the offset is number of reals)
253       ***********************************************************/
254       /* Element of Global Matrix, such as A, B, C */
255       #define E(Matrix)   (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) )
256       #define EA(Matrix)  (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) )
257       #define EB(Matrix)  (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) )
258 
259       /* FIXME - may pay to expand these out - got higher speed-ups below */
260       /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */
261       E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) );
262 
263       /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */
264       E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21);
265 
266       /* S3 = A11 - A21 */
267       E(S3) = EA(A11) - EA(A21);
268 
269       /* S7 = B22 - B12 */
270       E(S7) = EB(B22) - EB(B12);
271 
272       TempMatrixOffset += sizeof(REAL);
273       MatrixOffsetA += sizeof(REAL);
274       MatrixOffsetB += sizeof(REAL);
275     } /* end row loop*/
276 
277     MatrixOffsetA += RowIncrementA;
278     MatrixOffsetB += RowIncrementB;
279   } /* end column loop */
280 
281   /* M2 = A11 x B11 */
282   OptimizedStrassenMultiply_seq(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1);
283 
284   /* M5 = S1 * S5 */
285   OptimizedStrassenMultiply_seq(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1);
286 
287   /* Step 1 of T1 = S2 x S6 + M2 */
288   OptimizedStrassenMultiply_seq(T1sMULT, S2, S6,  QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1);
289 
290   /* Step 1 of T2 = T1 + S3 x S7 */
291   OptimizedStrassenMultiply_seq(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1);
292 
293   /* Step 1 of C11 = M2 + A12 * B21 */
294   OptimizedStrassenMultiply_seq(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1);
295 
296   /* Step 1 of C12 = S4 x B22 + T1 + M5 */
297   OptimizedStrassenMultiply_seq(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1);
298 
299   /* Step 1 of C21 = T2 - A22 * S8 */
300   OptimizedStrassenMultiply_seq(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1);
301 
302   /***************************************************************************
303   ** Step through all columns row by row (vertically)
304   ** (jumps in memory by RowWidth => bad locality)
305   ** (but we want the best locality on the innermost loop)
306   ***************************************************************************/
307   for (Row = 0; Row < QuadrantSize; Row++) {
308     /*************************************************************************
309     ** Step through each row horizontally (addressing elements in each column)
310     ** (jumps linearly througn memory => good locality)
311     *************************************************************************/
312     for (Column = 0; Column < QuadrantSize; Column += 4) {
313       REAL LocalM5_0 = *(M5);
314       REAL LocalM5_1 = *(M5+1);
315       REAL LocalM5_2 = *(M5+2);
316       REAL LocalM5_3 = *(M5+3);
317       REAL LocalM2_0 = *(M2);
318       REAL LocalM2_1 = *(M2+1);
319       REAL LocalM2_2 = *(M2+2);
320       REAL LocalM2_3 = *(M2+3);
321       REAL T1_0 = *(T1sMULT) + LocalM2_0;
322       REAL T1_1 = *(T1sMULT+1) + LocalM2_1;
323       REAL T1_2 = *(T1sMULT+2) + LocalM2_2;
324       REAL T1_3 = *(T1sMULT+3) + LocalM2_3;
325       REAL T2_0 = *(C22) + T1_0;
326       REAL T2_1 = *(C22+1) + T1_1;
327       REAL T2_2 = *(C22+2) + T1_2;
328       REAL T2_3 = *(C22+3) + T1_3;
329       (*(C11))   += LocalM2_0;
330       (*(C11+1)) += LocalM2_1;
331       (*(C11+2)) += LocalM2_2;
332       (*(C11+3)) += LocalM2_3;
333       (*(C12))   += LocalM5_0 + T1_0;
334       (*(C12+1)) += LocalM5_1 + T1_1;
335       (*(C12+2)) += LocalM5_2 + T1_2;
336       (*(C12+3)) += LocalM5_3 + T1_3;
337       (*(C22))   = LocalM5_0 + T2_0;
338       (*(C22+1)) = LocalM5_1 + T2_1;
339       (*(C22+2)) = LocalM5_2 + T2_2;
340       (*(C22+3)) = LocalM5_3 + T2_3;
341       (*(C21  )) = (- *(C21  )) + T2_0;
342       (*(C21+1)) = (- *(C21+1)) + T2_1;
343       (*(C21+2)) = (- *(C21+2)) + T2_2;
344       (*(C21+3)) = (- *(C21+3)) + T2_3;
345       M5 += 4;
346       M2 += 4;
347       T1sMULT += 4;
348       C11 += 4;
349       C12 += 4;
350       C21 += 4;
351       C22 += 4;
352     }
353     C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC);
354     C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC);
355     C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC);
356     C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC);
357   }
358   free(StartHeap);
359 }
360 
361 
362 
363 
364 
365 /*****************************************************************************
366 **
367 ** FastNaiveMatrixMultiply
368 **
369 ** For small to medium sized matrices A, B, and C of size
370 ** MatrixSize * MatrixSize this function performs the operation
371 ** C = A x B efficiently.
372 **
373 ** Note MatrixSize must be divisible by 8.
374 **
375 ** INPUT:
376 **    C = (*C WRITE) Address of top left element of matrix C.
377 **    A = (*A IS READ ONLY) Address of top left element of matrix A.
378 **    B = (*B IS READ ONLY) Address of top left element of matrix B.
379 **    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
380 **    RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
381 **    RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
382 **    RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
383 **
384 ** OUTPUT:
385 **    C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.)
386 **
387 *****************************************************************************/
FastNaiveMatrixMultiply(REAL * C,REAL * A,REAL * B,unsigned MatrixSize,unsigned RowWidthC,unsigned RowWidthA,unsigned RowWidthB)388 inline void FastNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
389     unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB) {
390   /* Assumes size of real is 8 bytes */
391   PTR RowWidthBInBytes = RowWidthB  << 3;
392   PTR RowWidthAInBytes = RowWidthA << 3;
393   PTR MatrixWidthInBytes = MatrixSize << 3;
394   PTR RowIncrementC = ( RowWidthC - MatrixSize) << 3;
395   unsigned Horizontal, Vertical;
396 
397   REAL *ARowStart = A;
398   for (Vertical = 0; Vertical < MatrixSize; Vertical++) {
399     for (Horizontal = 0; Horizontal < MatrixSize; Horizontal += 8) {
400       REAL *BColumnStart = B + Horizontal;
401       REAL FirstARowValue = *ARowStart++;
402 
403       REAL Sum0 = FirstARowValue * (*BColumnStart);
404       REAL Sum1 = FirstARowValue * (*(BColumnStart+1));
405       REAL Sum2 = FirstARowValue * (*(BColumnStart+2));
406       REAL Sum3 = FirstARowValue * (*(BColumnStart+3));
407       REAL Sum4 = FirstARowValue * (*(BColumnStart+4));
408       REAL Sum5 = FirstARowValue * (*(BColumnStart+5));
409       REAL Sum6 = FirstARowValue * (*(BColumnStart+6));
410       REAL Sum7 = FirstARowValue * (*(BColumnStart+7));
411 
412       unsigned Products;
413       for (Products = 1; Products < MatrixSize; Products++) {
414         REAL ARowValue = *ARowStart++;
415         BColumnStart = (REAL*) (((PTR) BColumnStart) + RowWidthBInBytes);
416         Sum0 += ARowValue * (*BColumnStart);
417         Sum1 += ARowValue * (*(BColumnStart+1));
418         Sum2 += ARowValue * (*(BColumnStart+2));
419         Sum3 += ARowValue * (*(BColumnStart+3));
420         Sum4 += ARowValue * (*(BColumnStart+4));
421         Sum5 += ARowValue * (*(BColumnStart+5));
422         Sum6 += ARowValue * (*(BColumnStart+6));
423         Sum7 += ARowValue * (*(BColumnStart+7));
424       }
425       ARowStart = (REAL*) ( ((PTR) ARowStart) - MatrixWidthInBytes);
426 
427       *(C) = Sum0;
428       *(C+1) = Sum1;
429       *(C+2) = Sum2;
430       *(C+3) = Sum3;
431       *(C+4) = Sum4;
432       *(C+5) = Sum5;
433       *(C+6) = Sum6;
434       *(C+7) = Sum7;
435       C+=8;
436     }
437     ARowStart = (REAL*) ( ((PTR) ARowStart) + RowWidthAInBytes );
438     C = (REAL*) ( ((PTR) C) + RowIncrementC );
439   }
440 }
441 
442 
443 /*****************************************************************************
444 **
445 ** FastAdditiveNaiveMatrixMultiply
446 **
447 ** For small to medium sized matrices A, B, and C of size
448 ** MatrixSize * MatrixSize this function performs the operation
449 ** C += A x B efficiently.
450 **
451 ** Note MatrixSize must be divisible by 8.
452 **
453 ** INPUT:
454 **    C = (*C READ/WRITE) Address of top left element of matrix C.
455 **    A = (*A IS READ ONLY) Address of top left element of matrix A.
456 **    B = (*B IS READ ONLY) Address of top left element of matrix B.
457 **    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
458 **    RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
459 **    RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
460 **    RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
461 **
462 ** OUTPUT:
463 **    C = (*C READ/WRITE) Matrix C contains C + A x B.
464 **
465 *****************************************************************************/
FastAdditiveNaiveMatrixMultiply(REAL * C,REAL * A,REAL * B,unsigned MatrixSize,unsigned RowWidthC,unsigned RowWidthA,unsigned RowWidthB)466 inline void FastAdditiveNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
467     unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB) {
468   /* Assumes size of real is 8 bytes */
469   PTR RowWidthBInBytes = RowWidthB  << 3;
470   PTR RowWidthAInBytes = RowWidthA << 3;
471   PTR MatrixWidthInBytes = MatrixSize << 3;
472   PTR RowIncrementC = ( RowWidthC - MatrixSize) << 3;
473   unsigned Horizontal, Vertical;
474 
475   REAL *ARowStart = A;
476   for (Vertical = 0; Vertical < MatrixSize; Vertical++) {
477     for (Horizontal = 0; Horizontal < MatrixSize; Horizontal += 8) {
478       REAL *BColumnStart = B + Horizontal;
479 
480       REAL Sum0 = *C;
481       REAL Sum1 = *(C+1);
482       REAL Sum2 = *(C+2);
483       REAL Sum3 = *(C+3);
484       REAL Sum4 = *(C+4);
485       REAL Sum5 = *(C+5);
486       REAL Sum6 = *(C+6);
487       REAL Sum7 = *(C+7);
488 
489       unsigned Products;
490       for (Products = 0; Products < MatrixSize; Products++) {
491         REAL ARowValue = *ARowStart++;
492 
493         Sum0 += ARowValue * (*BColumnStart);
494         Sum1 += ARowValue * (*(BColumnStart+1));
495         Sum2 += ARowValue * (*(BColumnStart+2));
496         Sum3 += ARowValue * (*(BColumnStart+3));
497         Sum4 += ARowValue * (*(BColumnStart+4));
498         Sum5 += ARowValue * (*(BColumnStart+5));
499         Sum6 += ARowValue * (*(BColumnStart+6));
500         Sum7 += ARowValue * (*(BColumnStart+7));
501 
502         BColumnStart = (REAL*) (((PTR) BColumnStart) + RowWidthBInBytes);
503 
504       }
505       ARowStart = (REAL*) ( ((PTR) ARowStart) - MatrixWidthInBytes);
506 
507       *(C) = Sum0;
508       *(C+1) = Sum1;
509       *(C+2) = Sum2;
510       *(C+3) = Sum3;
511       *(C+4) = Sum4;
512       *(C+5) = Sum5;
513       *(C+6) = Sum6;
514       *(C+7) = Sum7;
515       C+=8;
516     }
517 
518     ARowStart = (REAL*) ( ((PTR) ARowStart) + RowWidthAInBytes );
519     C = (REAL*) ( ((PTR) C) + RowIncrementC );
520   }
521 }
522 
523 
524 
525 /*****************************************************************************
526 **
527 ** MultiplyByDivideAndConquer
528 **
529 ** For medium to medium-large (would you like fries with that) sized
530 ** matrices A, B, and C of size MatrixSize * MatrixSize this function
531 ** efficiently performs the operation
532 **    C  = A x B (if AdditiveMode == 0)
533 **    C += A x B (if AdditiveMode != 0)
534 **
535 ** Note MatrixSize must be divisible by 16.
536 **
537 ** INPUT:
538 **    C = (*C READ/WRITE) Address of top left element of matrix C.
539 **    A = (*A IS READ ONLY) Address of top left element of matrix A.
540 **    B = (*B IS READ ONLY) Address of top left element of matrix B.
541 **    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
542 **    RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
543 **    RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
544 **    RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
545 **    AdditiveMode = 0 if we want C = A x B, otherwise we'll do C += A x B
546 **
547 ** OUTPUT:
548 **    C (+)= A x B. (+ if AdditiveMode != 0)
549 **
550 *****************************************************************************/
MultiplyByDivideAndConquer(REAL * C,REAL * A,REAL * B,unsigned MatrixSize,unsigned RowWidthC,unsigned RowWidthA,unsigned RowWidthB,int AdditiveMode)551 inline void MultiplyByDivideAndConquer(REAL *C, REAL *A, REAL *B,
552 				     unsigned MatrixSize,
553 				     unsigned RowWidthC,
554 				     unsigned RowWidthA,
555 				     unsigned RowWidthB,
556 				     int AdditiveMode
557 				    )
558 {
559   #define A00 A
560   #define B00 B
561   #define C00 C
562   REAL  *A01, *A10, *A11, *B01, *B10, *B11, *C01, *C10, *C11;
563   unsigned QuadrantSize = MatrixSize >> 1;
564 
565   /* partition the matrix */
566   A01 = A00 + QuadrantSize;
567   A10 = A00 + RowWidthA * QuadrantSize;
568   A11 = A10 + QuadrantSize;
569 
570   B01 = B00 + QuadrantSize;
571   B10 = B00 + RowWidthB * QuadrantSize;
572   B11 = B10 + QuadrantSize;
573 
574   C01 = C00 + QuadrantSize;
575   C10 = C00 + RowWidthC * QuadrantSize;
576   C11 = C10 + QuadrantSize;
577 
578   if (QuadrantSize > SizeAtWhichNaiveAlgorithmIsMoreEfficient) {
579 
580     MultiplyByDivideAndConquer(C00, A00, B00, QuadrantSize,
581 				     RowWidthC, RowWidthA, RowWidthB,
582 				     AdditiveMode);
583 
584     MultiplyByDivideAndConquer(C01, A00, B01, QuadrantSize,
585 				     RowWidthC, RowWidthA, RowWidthB,
586 				     AdditiveMode);
587 
588     MultiplyByDivideAndConquer(C11, A10, B01, QuadrantSize,
589 				     RowWidthC, RowWidthA, RowWidthB,
590 				     AdditiveMode);
591 
592     MultiplyByDivideAndConquer(C10, A10, B00, QuadrantSize,
593 				     RowWidthC, RowWidthA, RowWidthB,
594 				     AdditiveMode);
595 
596     MultiplyByDivideAndConquer(C00, A01, B10, QuadrantSize,
597 				     RowWidthC, RowWidthA, RowWidthB,
598 				     1);
599 
600     MultiplyByDivideAndConquer(C01, A01, B11, QuadrantSize,
601 				     RowWidthC, RowWidthA, RowWidthB,
602 				     1);
603 
604     MultiplyByDivideAndConquer(C11, A11, B11, QuadrantSize,
605 				     RowWidthC, RowWidthA, RowWidthB,
606 				     1);
607 
608     MultiplyByDivideAndConquer(C10, A11, B10, QuadrantSize,
609 				     RowWidthC, RowWidthA, RowWidthB,
610 				     1);
611 
612   } else {
613 
614     if (AdditiveMode) {
615       FastAdditiveNaiveMatrixMultiply(C00, A00, B00, QuadrantSize,
616 			      RowWidthC, RowWidthA, RowWidthB);
617 
618       FastAdditiveNaiveMatrixMultiply(C01, A00, B01, QuadrantSize,
619 			      RowWidthC, RowWidthA, RowWidthB);
620 
621       FastAdditiveNaiveMatrixMultiply(C11, A10, B01, QuadrantSize,
622 			      RowWidthC, RowWidthA, RowWidthB);
623 
624       FastAdditiveNaiveMatrixMultiply(C10, A10, B00, QuadrantSize,
625 			      RowWidthC, RowWidthA, RowWidthB);
626 
627     } else {
628 
629       FastNaiveMatrixMultiply(C00, A00, B00, QuadrantSize,
630 			      RowWidthC, RowWidthA, RowWidthB);
631 
632       FastNaiveMatrixMultiply(C01, A00, B01, QuadrantSize,
633 			      RowWidthC, RowWidthA, RowWidthB);
634 
635       FastNaiveMatrixMultiply(C11, A10, B01, QuadrantSize,
636 			      RowWidthC, RowWidthA, RowWidthB);
637 
638       FastNaiveMatrixMultiply(C10, A10, B00, QuadrantSize,
639 			      RowWidthC, RowWidthA, RowWidthB);
640     }
641 
642     FastAdditiveNaiveMatrixMultiply(C00, A01, B10, QuadrantSize,
643 				    RowWidthC, RowWidthA, RowWidthB);
644 
645     FastAdditiveNaiveMatrixMultiply(C01, A01, B11, QuadrantSize,
646 				    RowWidthC, RowWidthA, RowWidthB);
647 
648     FastAdditiveNaiveMatrixMultiply(C11, A11, B11, QuadrantSize,
649 				    RowWidthC, RowWidthA, RowWidthB);
650 
651     FastAdditiveNaiveMatrixMultiply(C10, A11, B10, QuadrantSize,
652 				    RowWidthC, RowWidthA, RowWidthB);
653   }
654 }
655 
dump_matrix(double * C,const unsigned sz,std::string && fname)656 inline void dump_matrix(double *C, const unsigned sz, std::string&& fname) {
657   std::ofstream ofs {fname};
658 
659   for(unsigned i=0; i<sz; i++) {
660     ofs << C[i] << ' ';
661   }
662 }
663 
init_ABC()664 inline void init_ABC() {
665   MatrixA = alloc_matrix(MATRIX_SIZE);
666   MatrixB = alloc_matrix(MATRIX_SIZE);
667   MatrixC = alloc_matrix(MATRIX_SIZE);
668 
669   ::srand(1);
670   init_matrix(MATRIX_SIZE, MatrixA, MATRIX_SIZE);
671   init_matrix(MATRIX_SIZE, MatrixB, MATRIX_SIZE);
672 }
673 
674 std::chrono::microseconds measure_time_omp(unsigned, REAL *, REAL *, REAL *, int);
675 std::chrono::microseconds measure_time_tbb(unsigned, REAL *, REAL *, REAL *, int);
676 std::chrono::microseconds measure_time_taskflow(unsigned, REAL *, REAL *, REAL *, int);
677 
678