1 // =============================================================================
2 // === GPUQREngine/Source/Scheduler_Front.cpp ==================================
3 // =============================================================================
4 //
5 // This file contains code to manage fronts within the scheduler.
6 //
7 // The following functions are implemented:
8 //
9 // - activateFront
10 // This function adds the front to the permutation of active fronts,
11 // configures the inverse permutation for O(1) lookups, and sets the
12 // initial factorization state of the front.
13 //
14 // - pullFrontData
15 // This function coordinates the asynchronous pull of the R factor off of
16 // the GPU as soon as it is available. This function uses the cuda events
17 // and streams model.
18 //
19 // - finishFront
20 // This function is the inverse of activateFront. It removes the front from
21 // the list of active fronts. The call is idempotent and coordinates with
22 // the cuda events and streams responsible for pulling the R factor in order
23 // to not accidentally free a front whose R factor is still in transit.
24 //
25 // =============================================================================
26
27 #include "GPUQREngine_Scheduler.hpp"
28
29
30 // -----------------------------------------------------------------------------
31 // Scheduler::activateFront
32 // -----------------------------------------------------------------------------
33
activateFront(Int f)34 void Scheduler::activateFront
35 (
36 Int f // The front id to manipulate
37 )
38 {
39 /* If the front has already been activated, exit early. */
40 if(afPinv[f] != EMPTY) return;
41
42 Front *front = (&frontList[f]);
43
44 /* Add this front to the list of active fronts. */
45 afPerm[numActiveFronts] = f;
46 afPinv[f] = numActiveFronts;
47 numActiveFronts++;
48
49 /* If the front is dense then there are no rows of S to assemble. */
50 if(front->isDense())
51 {
52 front->state = FACTORIZE ;
53 }
54 /* Else the front is sparse: */
55 else
56 {
57 /* If we're only doing a push assembly, jump to parent wait. */
58 if(front->sparseMeta.pushOnly)
59 {
60 front->state = PARENT_WAIT;
61 }
62 /* Else we are doing a full factorization of this front; assemble S. */
63 else
64 {
65 front->state = ASSEMBLE_S;
66 }
67 }
68 }
69
70 // -----------------------------------------------------------------------------
71 // Scheduler::pullFrontData
72 // -----------------------------------------------------------------------------
73
pullFrontData(Int f)74 bool Scheduler::pullFrontData
75 (
76 Int f // The front id to manipulate
77 )
78 {
79 /* Grab the front descriptor. */
80 Front *front = (&frontList[f]);
81
82 /* If we're only doing a push assembly then there's nothing to pull. */
83 if(front->isPushOnly()) return true;
84
85 /* If we already pulled the R factor, return early. */
86 if(FrontDataPulled[f]) return true;
87
88 /* If the R factor isn't actually ready yet, return false.
89 * This can happen if the kernel responsible for finishing the factorization
90 * is running while we're trying to execute this subroutine. */
91 // assert(eventFrontDataReady[f] != NULL);
92 if(cudaEventQuery(eventFrontDataReady[f]) != cudaSuccess){ return false; }
93 cudaEventDestroy(eventFrontDataReady[f]);
94
95 /* Use an event to signal when the R factor is off the GPU. */
96 cudaEventCreate(&eventFrontDataPulled[f]);
97
98 /* Determine how many values to pull back from the GPU: */
99
100 /* We always pull R. */
101 Int numValuesToPull = front->getNumRValues();
102
103 /* If we're doing a sparse factorization and this front is staged,
104 we also need to pull the contribution block rows. */
105 if(front->isStaged())
106 {
107 SparseMeta *meta = &(front->sparseMeta);
108 numValuesToPull += meta->cm * front->fn;
109 }
110
111 /* Surgically transfer the data across the D2H stream. */
112 Workspace wsR = Workspace(numValuesToPull, sizeof(double));
113 wsR.assign(front->cpuR, front->gpuF);
114 wsR.transfer(cudaMemcpyDeviceToHost, false, memoryStreamD2H);
115 wsR.assign(NULL, NULL);
116
117 /* Record the event to signal when R is off the GPU. */
118 cudaEventRecord(eventFrontDataPulled[f]);
119
120 /* Save and return that we've initiated the R factor pull. */
121 return (FrontDataPulled[f] = true);
122 }
123
124 // -----------------------------------------------------------------------------
125 // Scheduler::finishFront
126 // -----------------------------------------------------------------------------
127
finishFront(Int f)128 bool Scheduler::finishFront
129 (
130 Int f // The front id to manipulate
131 )
132 {
133 /* If we've already freed the front, return early. */
134 if(afPinv[f] == EMPTY) return true;
135
136 Front *front = (&frontList[f]);
137
138 /* If we're doing more than a push, we need to get the data off the GPU. */
139 if(!front->isPushOnly())
140 {
141 /* Non-blocking guard to make sure front data is off the GPU. */
142 if(cudaEventQuery(eventFrontDataPulled[f]) != cudaSuccess)
143 {
144 return false;
145 }
146 cudaEventDestroy(eventFrontDataPulled[f]);
147 }
148
149 /* Remove the front from the active fronts. */
150 numActiveFronts--;
151 if(numActiveFronts > 0)
152 {
153 /* Replace the active front slot with the last front in the list. */
154 Int replacer = afPerm[numActiveFronts];
155 Int position = afPinv[f];
156 afPerm[position] = replacer;
157 afPinv[replacer] = position;
158 }
159 afPinv[f] = EMPTY;
160
161 /* If we got through this method, we have successfully freed the front. */
162 return true;
163 }
164
165 // -----------------------------------------------------------------------------
166 // Scheduler::debugDumpFront
167 // -----------------------------------------------------------------------------
168
169 #if 1
debugDumpFront(Front * front)170 void Scheduler::debugDumpFront(Front *front)
171 {
172 Workspace *wsFront =
173 Workspace::allocate (front->getNumFrontValues(), // CPU, DEBUG ONLY
174 sizeof(double), false, true, false, false);
175 double *F = CPU_REFERENCE(wsFront, double*);
176 Int fm = front->fm;
177 Int fn = front->fn;
178 wsFront->assign(wsFront->cpu(), front->gpuF);
179 wsFront->transfer(cudaMemcpyDeviceToHost);
180 printf("--- %g ---\n", (double) (front->fidg));
181
182 // for(Int i=0; i<fm; i++)
183 // {
184 // for(Int j=0; j<fn; j++)
185 // {
186 // printf("%16.8e ", F[i*fn+j]);
187 // }
188 // printf("\n");
189 // }
190
191 for (Int j = 0 ; j < fn ; j++)
192 {
193 printf (" --- column %ld of %ld\n", j, fn) ;
194 for (Int i = 0 ; i < fm ; i++)
195 {
196 if (i == j) printf (" [ diag: ") ;
197 else printf (" row %4ld ", i) ;
198 printf (" %10.4g", F [fn*i+j]) ;
199 if (i == j) printf (" ]\n") ;
200 else printf ("\n") ;
201 }
202 printf ("\n") ;
203 }
204
205 printf("----------\n", front->fidg);
206 wsFront->assign(wsFront->cpu(), NULL);
207 wsFront = Workspace::destroy(wsFront);
208 }
209 #endif
210