1 // Copyright 2009-2021 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
3 
4 #include "DistributedLoadBalancer.h"
5 #include <algorithm>
6 #include <limits>
7 #include <map>
8 #include "../fb/DistributedFrameBuffer.h"
9 #include "WriteMultipleTileOperation.h"
10 #include "camera/Camera.h"
11 #include "common/MPICommon.h"
12 #include "common/Profiling.h"
13 #include "distributed/DistributedRenderer.h"
14 #include "rkcommon/tasking/parallel_for.h"
15 
16 namespace ospray {
17 namespace mpi {
18 namespace staticLoadBalancer {
19 using namespace mpicommon;
20 using namespace rkcommon;
21 
renderFrame(FrameBuffer * _fb,Renderer * _renderer,Camera * camera,World * _world)22 void Distributed::renderFrame(
23     FrameBuffer *_fb, Renderer *_renderer, Camera *camera, World *_world)
24 {
25   auto *dfb = dynamic_cast<DistributedFrameBuffer *>(_fb);
26 
27   auto *world = dynamic_cast<DistributedWorld *>(_world);
28   if (!world) {
29     throw std::runtime_error(
30         "Distributed Load Balancer only supports DistributedWorld!");
31   }
32 
33   auto *renderer = dynamic_cast<DistributedRenderer *>(_renderer);
34   if (!renderer) {
35     if (world->allRegions.size() == 1) {
36       renderFrameReplicated(dfb, _renderer, camera, world);
37       return;
38     } else {
39       throw std::runtime_error(
40           "Distributed rendering requires a "
41           "distributed renderer!");
42     }
43   }
44 
45   if (dfb->getLastRenderer() != renderer) {
46     dfb->setTileOperation(renderer->tileOperation(), renderer);
47   }
48 
49   dfb->startNewFrame(renderer->errorThreshold);
50   void *perFrameData = renderer->beginFrame(dfb, world);
51   const size_t numRegions = world->allRegions.size();
52 
53 #ifdef ENABLE_PROFILING
54   ProfilingPoint start = ProfilingPoint();
55 #endif
56   std::set<int> tilesForFrame;
57   for (int i = workerRank(); i < dfb->getTotalTiles(); i += workerSize()) {
58     const uint32_t tile_y = i / dfb->getNumTiles().x;
59     const uint32_t tile_x = i - tile_y * dfb->getNumTiles().x;
60     const vec2i tileID(tile_x, tile_y);
61     // Skip tiles that have been rendered to satisfactory error level
62     if (dfb->tileError(tileID) > renderer->errorThreshold) {
63       tilesForFrame.insert(i);
64     }
65   }
66 
67   const vec2i numTiles = dfb->getNumTiles();
68   const vec2i fbSize = dfb->getNumPixels();
69   for (const auto &id : world->myRegionIds) {
70     const auto &r = world->allRegions[id];
71     box3f proj = camera->projectBox(r);
72     box2f screenRegion(vec2f(proj.lower) * fbSize, vec2f(proj.upper) * fbSize);
73 
74     // Pad the region a bit
75     screenRegion.lower = max(screenRegion.lower - TILE_SIZE, vec2f(0.f));
76     screenRegion.upper = min(screenRegion.upper + TILE_SIZE, vec2f(fbSize));
77 
78     // Skip regions that are completely behind the camera
79     if (proj.upper.z < 0.f) {
80       continue;
81     }
82 
83     box2i tileRegion;
84     tileRegion.lower = screenRegion.lower / TILE_SIZE;
85     tileRegion.upper = vec2i(std::ceil(screenRegion.upper.x / TILE_SIZE),
86         std::ceil(screenRegion.upper.y / TILE_SIZE));
87     tileRegion.upper = min(tileRegion.upper, numTiles);
88     for (int y = tileRegion.lower.y; y < tileRegion.upper.y; ++y) {
89       for (int x = tileRegion.lower.x; x < tileRegion.upper.x; ++x) {
90         // Skip tiles that have been rendered to satisfactory error level
91         if (dfb->tileError(vec2i(x, y)) <= renderer->errorThreshold) {
92           continue;
93         }
94 
95         const int tileIndex = x + y * numTiles.x;
96         const auto &owners = world->regionOwners[id];
97         const size_t numRegionOwners = owners.size();
98         const size_t ownerRank =
99             std::distance(owners.begin(), owners.find(workerRank()));
100         // TODO: Can we do a better than round-robin over all tiles
101         // assignment here? It could be that we end up not evenly dividing
102         // the workload.
103         const bool regionTileOwner = (tileIndex % numRegionOwners) == ownerRank;
104         if (regionTileOwner) {
105           tilesForFrame.insert(tileIndex);
106         }
107       }
108     }
109   }
110 #ifdef ENABLE_PROFILING
111   ProfilingPoint end = ProfilingPoint();
112   std::cout << "Initial tile for frame determination "
113             << elapsedTimeMs(start, end) << "ms\n";
114 #endif
115 
116 #ifdef ENABLE_PROFILING
117   start = ProfilingPoint();
118 #endif
119   tasking::parallel_for(tilesForFrame.size(), [&](size_t taskId) {
120     auto tileIter = tilesForFrame.begin();
121     std::advance(tileIter, taskId);
122     const int tileIndex = *tileIter;
123     const uint32_t numTiles_x = static_cast<uint32_t>(dfb->getNumTiles().x);
124     const uint32_t tile_y = tileIndex / numTiles_x;
125     const uint32_t tile_x = tileIndex - tile_y * numTiles_x;
126     const vec2i tileID(tile_x, tile_y);
127     const int32 accumID = dfb->accumID(tileID);
128     const bool tileOwner = (tileIndex % workerSize()) == workerRank();
129     const int NUM_JOBS = (TILE_SIZE * TILE_SIZE) / RENDERTILE_PIXELS_PER_JOB;
130 
131     const auto fbSize = dfb->getNumPixels();
132 
133     Tile __aligned(64) bgtile(tileID, fbSize, accumID);
134 
135     // The visibility entries are sorted by the region id, matching
136     // the order of the allRegions vector.
137     bool *regionVisible = STACK_BUFFER(bool, numRegions);
138     std::fill(regionVisible, regionVisible + numRegions, false);
139 
140     // The first renderTile doesn't actually do any rendering, and instead
141     // just computes which tiles the region projects to, giving us the
142     // exact bounds of the region's projection onto the image
143     tasking::parallel_for(static_cast<size_t>(NUM_JOBS), [&](size_t tIdx) {
144       renderer->computeRegionVisibility(
145           dfb, camera, world, regionVisible, perFrameData, bgtile, tIdx);
146     });
147 
148     // If we own the tile send the background color and the count of
149     // children for the number of regions projecting to it that will be
150     // sent.
151     if (tileOwner) {
152       bgtile.sortOrder = std::numeric_limits<int32_t>::max();
153       bgtile.generation = 0;
154       bgtile.children = 0;
155       // Note: not using std::count here as seems to not count properly in debug
156       // builds
157       for (size_t i = 0; i < numRegions; ++i) {
158         if (regionVisible[i]) {
159           ++bgtile.children;
160         }
161       }
162       dfb->setTile(bgtile);
163     }
164 
165     // Render our regions that project to this tile and ship them off
166     // to the tile owner.
167     std::vector<size_t> myVisibleRegions;
168     myVisibleRegions.reserve(world->myRegionIds.size());
169     for (const auto &rid : world->myRegionIds) {
170       if (regionVisible[rid]) {
171         myVisibleRegions.push_back(rid);
172       }
173     }
174     // If none of our regions are visible, we're done
175     if (myVisibleRegions.empty()) {
176       return;
177     }
178 
179     // TODO: Will it really be much benefit to run the regions in parallel
180     // as well? We already are running in parallel on the tiles and the
181     // pixels within the tiles, so adding another level may actually just
182     // give us worse cache coherence.
183 #define PARALLEL_REGION_RENDERING 1
184 #if PARALLEL_REGION_RENDERING
185     tasking::parallel_for(myVisibleRegions.size(), [&](size_t vid) {
186       const size_t rid = myVisibleRegions[vid];
187       Tile __aligned(64) tile(tileID, fbSize, accumID);
188 #else
189       for (const size_t &rid : myVisibleRegions) {
190         Tile &tile = bgtile;
191 #endif
192       tile.generation = 1;
193       tile.children = 0;
194       // If we share ownership of this region but aren't responsible
195       // for rendering it to this tile, don't render it.
196       // Note that we do need to double check here, since we could have
197       // multiple shared regions projecting to the same tile, and we
198       // could be the region tile owner for only some of those
199       const auto &owners = world->regionOwners[rid];
200       const size_t numRegionOwners = owners.size();
201       const size_t ownerRank =
202           std::distance(owners.begin(), owners.find(workerRank()));
203       const bool regionTileOwner = (tileIndex % numRegionOwners) == ownerRank;
204       if (regionTileOwner) {
205         tasking::parallel_for(NUM_JOBS, [&](int tIdx) {
206           renderer->renderRegionToTile(dfb,
207               camera,
208               world,
209               world->allRegions[rid],
210               perFrameData,
211               tile,
212               tIdx);
213         });
214         // Unused
215         // tile.sortOrder = sortOrder[rid];
216         dfb->setTile(tile);
217       }
218 #if PARALLEL_REGION_RENDERING
219     });
220 #else
221     }
222 #endif
223   });
224 #ifdef ENABLE_PROFILING
225   end = ProfilingPoint();
226   std::cout << "Local rendering for frame " << elapsedTimeMs(start, end)
227             << "ms\n";
228 #endif
229 
230   dfb->waitUntilFinished();
231   renderer->endFrame(dfb, perFrameData);
232 
233   dfb->endFrame(renderer->errorThreshold, camera);
234 }
235 
renderFrameReplicated(DistributedFrameBuffer * fb,Renderer * renderer,Camera * camera,DistributedWorld * world)236 void Distributed::renderFrameReplicated(DistributedFrameBuffer *fb,
237     Renderer *renderer,
238     Camera *camera,
239     DistributedWorld *world)
240 {
241   std::shared_ptr<TileOperation> tileOperation = nullptr;
242   if (fb->getLastRenderer() != renderer) {
243     tileOperation = std::make_shared<WriteMultipleTileOperation>();
244     fb->setTileOperation(tileOperation, renderer);
245   } else {
246     tileOperation = fb->getTileOperation();
247   }
248 
249 #ifdef ENABLE_PROFILING
250   ProfilingPoint start;
251 #endif
252   fb->startNewFrame(renderer->errorThreshold);
253   void *perFrameData = renderer->beginFrame(fb, world);
254 #ifdef ENABLE_PROFILING
255   ProfilingPoint end;
256   std::cout << "Start new frame took: " << elapsedTimeMs(start, end) << "ms\n";
257 #endif
258 
259   const auto fbSize = fb->getNumPixels();
260 
261   const int ALLTASKS = fb->getTotalTiles();
262   int NTASKS = ALLTASKS / workerSize();
263 
264   // NOTE(jda) - If all tiles do not divide evenly among all worker ranks
265   //             (a.k.a. ALLTASKS / worker.size has a remainder), then
266   //             some ranks will have one extra tile to do. Thus NTASKS
267   //             is incremented if we are one of those ranks.
268   if ((ALLTASKS % workerSize()) > workerRank())
269     NTASKS++;
270 
271 #ifdef ENABLE_PROFILING
272   start = ProfilingPoint();
273 #endif
274   /* TODO WILL: This can dispatch back to LocalTiledLoadBalancer::renderTiles
275    * to render the tiles instead of repeating this loop here ourselves.
276    */
277   tasking::parallel_for(NTASKS, [&](int taskIndex) {
278     const size_t tileID = taskIndex * workerSize() + workerRank();
279     const size_t numTiles_x = fb->getNumTiles().x;
280     const size_t tile_y = tileID / numTiles_x;
281     const size_t tile_x = tileID - tile_y * numTiles_x;
282     const vec2i tileId(tile_x, tile_y);
283     const int32 accumID = fb->accumID(tileId);
284 
285     if (fb->tileError(tileId) <= renderer->errorThreshold)
286       return;
287 
288 #if TILE_SIZE > MAX_TILE_SIZE
289     auto tilePtr = make_unique<Tile>(tileId, fbSize, accumID);
290     auto &tile = *tilePtr;
291 #else
292           Tile __aligned(64) tile(tileId, fbSize, accumID);
293 #endif
294 
295     if (!fb->frameCancelled()) {
296       tasking::parallel_for(numJobs(renderer->spp, accumID), [&](size_t tid) {
297         renderer->renderTile(fb, camera, world, perFrameData, tile, tid);
298       });
299     }
300 
301     fb->setTile(tile);
302   });
303 #ifdef ENABLE_PROFILING
304   end = ProfilingPoint();
305   std::cout << "Render loop took: " << elapsedTimeMs(start, end)
306             << "ms, CPU %: " << cpuUtilization(start, end) << "%\n";
307 
308   start = ProfilingPoint();
309 #endif
310 
311   fb->waitUntilFinished();
312 
313 #ifdef ENABLE_PROFILING
314   end = ProfilingPoint();
315   std::cout << "Wait finished took: " << elapsedTimeMs(start, end)
316             << "ms, CPU %: " << cpuUtilization(start, end) << "%\n";
317 
318   start = ProfilingPoint();
319 #endif
320 
321   renderer->endFrame(fb, perFrameData);
322   fb->endFrame(renderer->errorThreshold, camera);
323 
324 #ifdef ENABLE_PROFILING
325   end = ProfilingPoint();
326   std::cout << "End frame took: " << elapsedTimeMs(start, end)
327             << "ms, CPU %: " << cpuUtilization(start, end) << "%\n";
328 #endif
329 }
330 
toString() const331 std::string Distributed::toString() const
332 {
333   return "ospray::mpi::staticLoadBalancer::Distributed";
334 }
335 
renderTiles(FrameBuffer *,Renderer *,Camera *,World *,const utility::ArrayView<int> &,void *)336 void Distributed::renderTiles(FrameBuffer *,
337     Renderer *,
338     Camera *,
339     World *,
340     const utility::ArrayView<int> &,
341     void *)
342 {
343   NOT_IMPLEMENTED;
344 }
345 
346 } // namespace staticLoadBalancer
347 } // namespace mpi
348 } // namespace ospray
349