// =============================================================================
// === GPUQREngine/Source/Scheduler_Front.cpp ==================================
// =============================================================================
//
// This file contains code to manage fronts within the scheduler.
//
// The following functions are implemented:
//
//  - activateFront
//    This function adds the front to the permutation of active fronts,
//    configures the inverse permutation for O(1) lookups, and sets the
//    initial factorization state of the front.
//
//  - pullFrontData
//    This function coordinates the asynchronous pull of the R factor off
//    the GPU as soon as it is available. This function uses the CUDA events
//    and streams model.
//
//  - finishFront
//    This function is the inverse of activateFront. It removes the front from
//    the list of active fronts. The call is idempotent and coordinates with
//    the CUDA events and streams responsible for pulling the R factor in order
//    to not accidentally free a front whose R factor is still in transit.
//
// =============================================================================
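
// -----------------------------------------------------------------------------
// Typical call sequence (an illustrative sketch, not code from this library;
// in practice the scheduler itself drives these calls and interleaves the
// retries with other work rather than busy-waiting, and f is just a front id):
//
//      activateFront (f) ;         // f joins the active set of fronts
//      ...                         // GPU kernels assemble and factorize f
//      pullFrontData (f) ;         // true once the async pull of R has begun
//      finishFront (f) ;           // true once R is off the GPU and f retired
// -----------------------------------------------------------------------------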

#include "GPUQREngine_Scheduler.hpp"


// -----------------------------------------------------------------------------
// Scheduler::activateFront
// -----------------------------------------------------------------------------

void Scheduler::activateFront
(
    Int f                                          // The front id to manipulate
)
{
    /* If the front has already been activated, exit early. */
    if(afPinv[f] != EMPTY) return;

    Front *front = (&frontList[f]);

    /* Add this front to the list of active fronts. */
    afPerm[numActiveFronts] = f;
    afPinv[f] = numActiveFronts;
    numActiveFronts++;
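
    /* At this point afPerm[afPinv[f]] == f, which is the invariant that makes
     * the O(1) lookups noted in the file header (and the O(1) removal in
     * finishFront) possible. */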

    /* If the front is dense then there are no rows of S to assemble. */
    if(front->isDense())
    {
        front->state = FACTORIZE ;
    }
    /* Else the front is sparse: */
    else
    {
        /* If we're only doing a push assembly, jump to parent wait. */
        if(front->sparseMeta.pushOnly)
        {
            front->state = PARENT_WAIT;
        }
        /* Else we are doing a full factorization of this front; assemble S. */
        else
        {
            front->state = ASSEMBLE_S;
        }
    }
}

// -----------------------------------------------------------------------------
// Scheduler::pullFrontData
// -----------------------------------------------------------------------------

bool Scheduler::pullFrontData
(
    Int f                                          // The front id to manipulate
)
{
    /* Grab the front descriptor. */
    Front *front = (&frontList[f]);

    /* If we're only doing a push assembly then there's nothing to pull. */
    if(front->isPushOnly()) return true;

    /* If we already pulled the R factor, return early. */
    if(FrontDataPulled[f]) return true;

    /* If the R factor isn't actually ready yet, return false.
     * This can happen if the kernel responsible for finishing the factorization
     * is running while we're trying to execute this subroutine. */
    // assert(eventFrontDataReady[f] != NULL);
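    /* Note: cudaEventQuery returns cudaSuccess only after all GPU work captured
     * when the event was recorded has completed; any other result (typically
     * cudaErrorNotReady) means the front's R factor is still being computed. */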
    if(cudaEventQuery(eventFrontDataReady[f]) != cudaSuccess){ return false; }
    cudaEventDestroy(eventFrontDataReady[f]);

    /* Use an event to signal when the R factor is off the GPU. */
    cudaEventCreate(&eventFrontDataPulled[f]);

    /* Determine how many values to pull back from the GPU: */

    /* We always pull R. */
    Int numValuesToPull = front->getNumRValues();

    /* If we're doing a sparse factorization and this front is staged,
       we also need to pull the contribution block rows. */
    if(front->isStaged())
    {
        SparseMeta *meta = &(front->sparseMeta);
        numValuesToPull += meta->cm * front->fn;
    }

    /* Surgically transfer the data across the D2H stream. */
    Workspace wsR = Workspace(numValuesToPull, sizeof(double));
    wsR.assign(front->cpuR, front->gpuF);
    wsR.transfer(cudaMemcpyDeviceToHost, false, memoryStreamD2H);
    wsR.assign(NULL, NULL);
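    /* The temporary Workspace is only a view onto buffers the front owns
     * (cpuR on the host, gpuF on the device); detaching them again keeps the
     * Workspace from ever appearing to own memory it must not free. */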

    /* Record the event to signal when R is off the GPU. */
    cudaEventRecord(eventFrontDataPulled[f]);

    /* Save and return that we've initiated the R factor pull. */
    return (FrontDataPulled[f] = true);
}

// -----------------------------------------------------------------------------
// Scheduler::finishFront
// -----------------------------------------------------------------------------

bool Scheduler::finishFront
(
    Int f                                          // The front id to manipulate
)
{
    /* If we've already freed the front, return early. */
    if(afPinv[f] == EMPTY) return true;

    Front *front = (&frontList[f]);

    /* If we're doing more than a push, we need to get the data off the GPU. */
    if(!front->isPushOnly())
    {
        /* Non-blocking guard to make sure front data is off the GPU. */
        if(cudaEventQuery(eventFrontDataPulled[f]) != cudaSuccess)
        {
            return false;
        }
        cudaEventDestroy(eventFrontDataPulled[f]);
    }
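
    /* Either there was nothing to pull (push-only front) or the D2H copy of R
     * has completed, so the front can now be retired safely. */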

    /* Remove the front from the active fronts. */
    numActiveFronts--;
    if(numActiveFronts > 0)
    {
        /* Replace the active front slot with the last front in the list. */
        Int replacer = afPerm[numActiveFronts];
        Int position = afPinv[f];
        afPerm[position] = replacer;
        afPinv[replacer] = position;
    }
    afPinv[f] = EMPTY;
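
    /* Example with hypothetical values: if afPerm = {3, 7, 5} and front 7
     * finishes, front 5 moves into slot 1 (afPinv[5] = 1), the active list
     * shrinks to {3, 5}, and afPinv[7] is cleared to EMPTY. */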

    /* If we got through this method, we have successfully freed the front. */
    return true;
}

// -----------------------------------------------------------------------------
// debugDumpFront
// -----------------------------------------------------------------------------

#if 1
void Scheduler::debugDumpFront(Front *front)
{
    Workspace *wsFront =
        Workspace::allocate (front->getNumFrontValues(),     // CPU, DEBUG ONLY
        sizeof(double), false, true, false, false);
    double *F = CPU_REFERENCE(wsFront, double*);
    Int fm = front->fm;
    Int fn = front->fn;
    wsFront->assign(wsFront->cpu(), front->gpuF);
    wsFront->transfer(cudaMemcpyDeviceToHost);
    printf("--- %g ---\n", (double) (front->fidg));

//  for(Int i=0; i<fm; i++)
//  {
//      for(Int j=0; j<fn; j++)
//      {
//          printf("%16.8e ", F[i*fn+j]);
//      }
//      printf("\n");
//  }

    for (Int j = 0 ; j < fn ; j++)
    {
        printf ("   --- column %ld of %ld\n", j, fn) ;
        for (Int i = 0 ; i < fm ; i++)
        {
            if (i == j) printf ("      [ diag:     ") ;
            else        printf ("      row %4ld    ", i) ;
            printf (" %10.4g", F [fn*i+j]) ;
            if (i == j) printf (" ]\n") ;
            else        printf ("\n") ;
        }
        printf ("\n") ;
    }

    printf("----------\n");
    wsFront->assign(wsFront->cpu(), NULL);
    wsFront = Workspace::destroy(wsFront);
}
#endif