1 #include <taskflow/taskflow.hpp>
2 #include "strassen.hpp"
3 
// Recursive Strassen multiplication: C = A * B, where A, B and C are
// MatrixSize x MatrixSize blocks embedded in row-major matrices whose row
// strides (in number of REALs) are RowWidthA, RowWidthB and RowWidthC.
// The seven recursive quadrant products are spawned as child tasks of the
// given tf::Subflow; a final join task (scheduled after all of them via
// gather) combines the partial products into the four C quadrants and
// frees the scratch heap.  Depth counts recursion levels (passed down to
// the recursive calls but not otherwise used here).
void OptimizedStrassenMultiply_tf(
  REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
  unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth, tf::Subflow& subflow)
{
  unsigned QuadrantSize = MatrixSize >> 1; /* MatixSize / 2 */

  /* +32 bytes of slack per scratch variable so the whole heap block can be
  ** bumped up to a 32-byte cache boundary below without overrunning the
  ** allocation. */
  unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * QuadrantSize + 32;
  unsigned Column, Row;

  /************************************************************************
  ** For each matrix A, B, and C, we'll want pointers to each quandrant
  ** in the matrix. These quandrants will be addressed as follows:
  **  --        --
  **  | A11  A12 |
  **  |          |
  **  | A21  A22 |
  **  --        --
  ************************************************************************/
  REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12,
       *A21, *B21, *C21, *A22, *B22, *C22;

  /* Scratch quadrants: S1..S4 are sums/differences of A's quadrants,
  ** S5..S8 of B's quadrants; M2 and M5 hold two of the seven Strassen
  ** products; T1sMULT holds the S2 x S6 product used to form T1 in the
  ** combine step.  T2's product is written straight into C22 (T2sMULT). */
  REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT;
  #define T2sMULT C22
  #define NumberOfVariables 11

  /* Byte offsets used with the E/EA/EB macros below to walk the scratch
  ** matrices (contiguous) and A/B (strided) in a single streaming pass. */
  PTR TempMatrixOffset = 0;
  PTR MatrixOffsetA = 0;
  PTR MatrixOffsetB = 0;

  char *Heap;
  void *StartHeap;

  /* Distance between the end of a matrix row and the start of the next row */
  /* NOTE(review): << 3 converts a REAL count into a byte offset, which
  ** assumes sizeof(REAL) == 8 — confirm against the REAL typedef in
  ** strassen.hpp. */
  PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3;
  PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3;
  PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3;

  /* Below the cutoff, Strassen's bookkeeping overhead is not worth it:
  ** fall back to plain divide-and-conquer multiplication. */
  if (MatrixSize <= CUTOFF_SIZE) {
    MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0);
    return;
  }

  /* Initialize quandrant matrices */
  #define A11 A
  #define B11 B
  #define C11 C
  A12 = A11 + QuadrantSize;
  B12 = B11 + QuadrantSize;
  C12 = C11 + QuadrantSize;
  A21 = A + (RowWidthA * QuadrantSize);
  B21 = B + (RowWidthB * QuadrantSize);
  C21 = C + (RowWidthC * QuadrantSize);
  A22 = A21 + QuadrantSize;
  B22 = B21 + QuadrantSize;
  C22 = C21 + QuadrantSize;

  /* Allocate Heap Space Here */
  /* NOTE(review): the malloc result is not checked for nullptr — a failed
  ** allocation would crash in the loops below. */
  Heap = static_cast<char*>(malloc(QuadrantSizeInBytes * NumberOfVariables));
  StartHeap = Heap;

  /* ensure that heap is on cache boundary */
  if ( ((PTR) Heap) & 31)
     Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) );

  /* Distribute the heap space over the variables */
  S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes;

  /***************************************************************************
  ** Pre-pass: fill S1..S8 element-by-element in one streaming sweep over
  ** the A and B quadrants (before any multiply task is spawned, so the
  ** tasks below can read the S matrices without synchronization).
  **
  ** Step through all columns row by row (vertically)
  ** (jumps in memory by RowWidth => bad locality)
  ** (but we want the best locality on the innermost loop)
  ***************************************************************************/
  for (Row = 0; Row < QuadrantSize; Row++) {

    /*************************************************************************
    ** Step through each row horizontally (addressing elements in each column)
    ** (jumps linearly througn memory => good locality)
    *************************************************************************/
    for (Column = 0; Column < QuadrantSize; Column++) {

      /***********************************************************
      ** Within this loop, the following holds for MatrixOffset:
      ** MatrixOffset = (Row * RowWidth) + Column
      ** (note: that the unit of the offset is number of reals)
      ***********************************************************/
      /* Element of Global Matrix, such as A, B, C.
      ** E   indexes a contiguous scratch matrix via TempMatrixOffset;
      ** EA  indexes an A quadrant via MatrixOffsetA (stride RowWidthA);
      ** EB  indexes a B quadrant via MatrixOffsetB (stride RowWidthB). */
      #define E(Matrix)   (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) )
      #define EA(Matrix)  (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) )
      #define EB(Matrix)  (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) )

      /* FIXME - may pay to expand these out - got higher speed-ups below */
      /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */
      E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) );

      /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */
      E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21);

      /* S3 = A11 - A21 */
      E(S3) = EA(A11) - EA(A21);

      /* S7 = B22 - B12 */
      E(S7) = EB(B22) - EB(B12);

      TempMatrixOffset += sizeof(REAL);
      MatrixOffsetA += sizeof(REAL);
      MatrixOffsetB += sizeof(REAL);
    } /* end row loop*/

    /* Skip the gap between the end of this quadrant row and the start of
    ** the next (A/B are embedded in wider matrices); the scratch matrices
    ** are contiguous, so TempMatrixOffset needs no adjustment. */
    MatrixOffsetA += RowIncrementA;
    MatrixOffsetB += RowIncrementB;
  } /* end column loop */

  /* The seven independent Strassen sub-products, each spawned as its own
  ** subflow task so they can run in parallel.  Scratch products use
  ** QuadrantSize as their row stride (contiguous); products written into
  ** C quadrants use RowWidthC. */
  std::vector<tf::Task> tasks;

  /* M2 = A11 x B11 */
  tasks.emplace_back(subflow.emplace([=](auto& subflow) {
    OptimizedStrassenMultiply_tf(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1, subflow);
  }));

  /* M5 = S1 * S5 */
  tasks.emplace_back(subflow.emplace([=](auto &subflow) {
    OptimizedStrassenMultiply_tf(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1, subflow);
  }));

  /* Step 1 of T1 = S2 x S6 + M2 */
  tasks.emplace_back(subflow.emplace([=](auto& subflow){
    OptimizedStrassenMultiply_tf(T1sMULT, S2, S6,  QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1, subflow);
  }));

  /* Step 1 of T2 = T1 + S3 x S7 (product stored directly in C22) */
  tasks.emplace_back(subflow.emplace([=](auto &subflow){
    OptimizedStrassenMultiply_tf(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1, subflow);
  }));

  /* Step 1 of C11 = M2 + A12 * B21 */
  tasks.emplace_back(subflow.emplace([=](auto &subflow){
    OptimizedStrassenMultiply_tf(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1, subflow);
  }));

  /* Step 1 of C12 = S4 x B22 + T1 + M5 */
  tasks.emplace_back(subflow.emplace([=](auto &subflow){
    OptimizedStrassenMultiply_tf(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1, subflow);
  }));

  /* Step 1 of C21 = T2 - A22 * S8 */
  tasks.emplace_back(subflow.emplace([=](auto &subflow){
    OptimizedStrassenMultiply_tf(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1, subflow);
  }));

  /**********************************************
  ** Synchronization Point
  **********************************************/
  /***************************************************************************
  ** Combine task: runs only after all seven products above complete (see
  ** end_task.gather(tasks) below).  Folds M2, M5, T1 and T2 into the four
  ** C quadrants, unrolled 4 elements per iteration.
  ** NOTE(review): the 4-way unrolling assumes QuadrantSize is a multiple
  ** of 4 — confirm the benchmark only uses such sizes.
  ** The lambda captures by value and is `mutable`, so the pointer bumps
  ** below modify private copies, not the enclosing function's pointers.
  **
  ** Step through all columns row by row (vertically)
  ** (jumps in memory by RowWidth => bad locality)
  ** (but we want the best locality on the innermost loop)
  ***************************************************************************/

  auto end_task = subflow.emplace([=]() mutable {
    for (auto Row = 0u; Row < QuadrantSize; Row++) {
      /*************************************************************************
      ** Step through each row horizontally (addressing elements in each column)
      ** (jumps linearly througn memory => good locality)
      *************************************************************************/
      for (auto Column = 0u; Column < QuadrantSize; Column += 4) {
        REAL LocalM5_0 = *(M5);
        REAL LocalM5_1 = *(M5+1);
        REAL LocalM5_2 = *(M5+2);
        REAL LocalM5_3 = *(M5+3);
        REAL LocalM2_0 = *(M2);
        REAL LocalM2_1 = *(M2+1);
        REAL LocalM2_2 = *(M2+2);
        REAL LocalM2_3 = *(M2+3);
        /* T1 = S2*S6 + M2 */
        REAL T1_0 = *(T1sMULT) + LocalM2_0;
        REAL T1_1 = *(T1sMULT+1) + LocalM2_1;
        REAL T1_2 = *(T1sMULT+2) + LocalM2_2;
        REAL T1_3 = *(T1sMULT+3) + LocalM2_3;
        /* T2 = T1 + S3*S7 (the product already sits in C22) */
        REAL T2_0 = *(C22) + T1_0;
        REAL T2_1 = *(C22+1) + T1_1;
        REAL T2_2 = *(C22+2) + T1_2;
        REAL T2_3 = *(C22+3) + T1_3;
        /* C11 += M2 */
        (*(C11))   += LocalM2_0;
        (*(C11+1)) += LocalM2_1;
        (*(C11+2)) += LocalM2_2;
        (*(C11+3)) += LocalM2_3;
        /* C12 += M5 + T1 */
        (*(C12))   += LocalM5_0 + T1_0;
        (*(C12+1)) += LocalM5_1 + T1_1;
        (*(C12+2)) += LocalM5_2 + T1_2;
        (*(C12+3)) += LocalM5_3 + T1_3;
        /* C22 = M5 + T2 */
        (*(C22))   = LocalM5_0 + T2_0;
        (*(C22+1)) = LocalM5_1 + T2_1;
        (*(C22+2)) = LocalM5_2 + T2_2;
        (*(C22+3)) = LocalM5_3 + T2_3;
        /* C21 = T2 - A22*S8 (the product already sits in C21) */
        (*(C21  )) = (- *(C21  )) + T2_0;
        (*(C21+1)) = (- *(C21+1)) + T2_1;
        (*(C21+2)) = (- *(C21+2)) + T2_2;
        (*(C21+3)) = (- *(C21+3)) + T2_3;
        M5 += 4;
        M2 += 4;
        T1sMULT += 4;
        C11 += 4;
        C12 += 4;
        C21 += 4;
        C22 += 4;
      }
      /* Jump over the stride gap to the next row of each C quadrant. */
      C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC);
      C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC);
      C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC);
      C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC);
    }
    /* All scratch matrices are dead now; release the single allocation. */
    free(StartHeap);
  });

  /* Make the combine task a successor of all seven multiply tasks. */
  end_task.gather(tasks);
}
228 
strassen_taskflow(unsigned num_threads,REAL * A,REAL * B,REAL * C,int n)229 void strassen_taskflow(unsigned num_threads, REAL *A, REAL *B, REAL *C, int n) {
230   tf::Taskflow flow;
231 
232   flow.emplace(
233     [=](auto &subflow) {
234       OptimizedStrassenMultiply_tf(C, A, B, n, n, n, n, 1, subflow);
235     }
236   );
237 
238   tf::Executor(num_threads).run(flow).wait();
239 }
240 
measure_time_taskflow(unsigned num_threads,REAL * A,REAL * B,REAL * C,int n)241 std::chrono::microseconds measure_time_taskflow(unsigned num_threads, REAL *A, REAL *B, REAL *C, int n) {
242   auto beg = std::chrono::high_resolution_clock::now();
243   strassen_taskflow(num_threads, A, B, C, n);
244   auto end = std::chrono::high_resolution_clock::now();
245   return std::chrono::duration_cast<std::chrono::microseconds>(end - beg);
246 }
247 
248