1 // Copyright 2009-2021 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
3
4 #include "DistributedLoadBalancer.h"
5 #include <algorithm>
6 #include <limits>
7 #include <map>
8 #include "../fb/DistributedFrameBuffer.h"
9 #include "WriteMultipleTileOperation.h"
10 #include "camera/Camera.h"
11 #include "common/MPICommon.h"
12 #include "common/Profiling.h"
13 #include "distributed/DistributedRenderer.h"
14 #include "rkcommon/tasking/parallel_for.h"
15
16 namespace ospray {
17 namespace mpi {
18 namespace staticLoadBalancer {
19 using namespace mpicommon;
20 using namespace rkcommon;
21
renderFrame(FrameBuffer * _fb,Renderer * _renderer,Camera * camera,World * _world)22 void Distributed::renderFrame(
23 FrameBuffer *_fb, Renderer *_renderer, Camera *camera, World *_world)
24 {
25 auto *dfb = dynamic_cast<DistributedFrameBuffer *>(_fb);
26
27 auto *world = dynamic_cast<DistributedWorld *>(_world);
28 if (!world) {
29 throw std::runtime_error(
30 "Distributed Load Balancer only supports DistributedWorld!");
31 }
32
33 auto *renderer = dynamic_cast<DistributedRenderer *>(_renderer);
34 if (!renderer) {
35 if (world->allRegions.size() == 1) {
36 renderFrameReplicated(dfb, _renderer, camera, world);
37 return;
38 } else {
39 throw std::runtime_error(
40 "Distributed rendering requires a "
41 "distributed renderer!");
42 }
43 }
44
45 if (dfb->getLastRenderer() != renderer) {
46 dfb->setTileOperation(renderer->tileOperation(), renderer);
47 }
48
49 dfb->startNewFrame(renderer->errorThreshold);
50 void *perFrameData = renderer->beginFrame(dfb, world);
51 const size_t numRegions = world->allRegions.size();
52
53 #ifdef ENABLE_PROFILING
54 ProfilingPoint start = ProfilingPoint();
55 #endif
56 std::set<int> tilesForFrame;
57 for (int i = workerRank(); i < dfb->getTotalTiles(); i += workerSize()) {
58 const uint32_t tile_y = i / dfb->getNumTiles().x;
59 const uint32_t tile_x = i - tile_y * dfb->getNumTiles().x;
60 const vec2i tileID(tile_x, tile_y);
61 // Skip tiles that have been rendered to satisfactory error level
62 if (dfb->tileError(tileID) > renderer->errorThreshold) {
63 tilesForFrame.insert(i);
64 }
65 }
66
67 const vec2i numTiles = dfb->getNumTiles();
68 const vec2i fbSize = dfb->getNumPixels();
69 for (const auto &id : world->myRegionIds) {
70 const auto &r = world->allRegions[id];
71 box3f proj = camera->projectBox(r);
72 box2f screenRegion(vec2f(proj.lower) * fbSize, vec2f(proj.upper) * fbSize);
73
74 // Pad the region a bit
75 screenRegion.lower = max(screenRegion.lower - TILE_SIZE, vec2f(0.f));
76 screenRegion.upper = min(screenRegion.upper + TILE_SIZE, vec2f(fbSize));
77
78 // Skip regions that are completely behind the camera
79 if (proj.upper.z < 0.f) {
80 continue;
81 }
82
83 box2i tileRegion;
84 tileRegion.lower = screenRegion.lower / TILE_SIZE;
85 tileRegion.upper = vec2i(std::ceil(screenRegion.upper.x / TILE_SIZE),
86 std::ceil(screenRegion.upper.y / TILE_SIZE));
87 tileRegion.upper = min(tileRegion.upper, numTiles);
88 for (int y = tileRegion.lower.y; y < tileRegion.upper.y; ++y) {
89 for (int x = tileRegion.lower.x; x < tileRegion.upper.x; ++x) {
90 // Skip tiles that have been rendered to satisfactory error level
91 if (dfb->tileError(vec2i(x, y)) <= renderer->errorThreshold) {
92 continue;
93 }
94
95 const int tileIndex = x + y * numTiles.x;
96 const auto &owners = world->regionOwners[id];
97 const size_t numRegionOwners = owners.size();
98 const size_t ownerRank =
99 std::distance(owners.begin(), owners.find(workerRank()));
100 // TODO: Can we do a better than round-robin over all tiles
101 // assignment here? It could be that we end up not evenly dividing
102 // the workload.
103 const bool regionTileOwner = (tileIndex % numRegionOwners) == ownerRank;
104 if (regionTileOwner) {
105 tilesForFrame.insert(tileIndex);
106 }
107 }
108 }
109 }
110 #ifdef ENABLE_PROFILING
111 ProfilingPoint end = ProfilingPoint();
112 std::cout << "Initial tile for frame determination "
113 << elapsedTimeMs(start, end) << "ms\n";
114 #endif
115
116 #ifdef ENABLE_PROFILING
117 start = ProfilingPoint();
118 #endif
119 tasking::parallel_for(tilesForFrame.size(), [&](size_t taskId) {
120 auto tileIter = tilesForFrame.begin();
121 std::advance(tileIter, taskId);
122 const int tileIndex = *tileIter;
123 const uint32_t numTiles_x = static_cast<uint32_t>(dfb->getNumTiles().x);
124 const uint32_t tile_y = tileIndex / numTiles_x;
125 const uint32_t tile_x = tileIndex - tile_y * numTiles_x;
126 const vec2i tileID(tile_x, tile_y);
127 const int32 accumID = dfb->accumID(tileID);
128 const bool tileOwner = (tileIndex % workerSize()) == workerRank();
129 const int NUM_JOBS = (TILE_SIZE * TILE_SIZE) / RENDERTILE_PIXELS_PER_JOB;
130
131 const auto fbSize = dfb->getNumPixels();
132
133 Tile __aligned(64) bgtile(tileID, fbSize, accumID);
134
135 // The visibility entries are sorted by the region id, matching
136 // the order of the allRegions vector.
137 bool *regionVisible = STACK_BUFFER(bool, numRegions);
138 std::fill(regionVisible, regionVisible + numRegions, false);
139
140 // The first renderTile doesn't actually do any rendering, and instead
141 // just computes which tiles the region projects to, giving us the
142 // exact bounds of the region's projection onto the image
143 tasking::parallel_for(static_cast<size_t>(NUM_JOBS), [&](size_t tIdx) {
144 renderer->computeRegionVisibility(
145 dfb, camera, world, regionVisible, perFrameData, bgtile, tIdx);
146 });
147
148 // If we own the tile send the background color and the count of
149 // children for the number of regions projecting to it that will be
150 // sent.
151 if (tileOwner) {
152 bgtile.sortOrder = std::numeric_limits<int32_t>::max();
153 bgtile.generation = 0;
154 bgtile.children = 0;
155 // Note: not using std::count here as seems to not count properly in debug
156 // builds
157 for (size_t i = 0; i < numRegions; ++i) {
158 if (regionVisible[i]) {
159 ++bgtile.children;
160 }
161 }
162 dfb->setTile(bgtile);
163 }
164
165 // Render our regions that project to this tile and ship them off
166 // to the tile owner.
167 std::vector<size_t> myVisibleRegions;
168 myVisibleRegions.reserve(world->myRegionIds.size());
169 for (const auto &rid : world->myRegionIds) {
170 if (regionVisible[rid]) {
171 myVisibleRegions.push_back(rid);
172 }
173 }
174 // If none of our regions are visible, we're done
175 if (myVisibleRegions.empty()) {
176 return;
177 }
178
179 // TODO: Will it really be much benefit to run the regions in parallel
180 // as well? We already are running in parallel on the tiles and the
181 // pixels within the tiles, so adding another level may actually just
182 // give us worse cache coherence.
183 #define PARALLEL_REGION_RENDERING 1
184 #if PARALLEL_REGION_RENDERING
185 tasking::parallel_for(myVisibleRegions.size(), [&](size_t vid) {
186 const size_t rid = myVisibleRegions[vid];
187 Tile __aligned(64) tile(tileID, fbSize, accumID);
188 #else
189 for (const size_t &rid : myVisibleRegions) {
190 Tile &tile = bgtile;
191 #endif
192 tile.generation = 1;
193 tile.children = 0;
194 // If we share ownership of this region but aren't responsible
195 // for rendering it to this tile, don't render it.
196 // Note that we do need to double check here, since we could have
197 // multiple shared regions projecting to the same tile, and we
198 // could be the region tile owner for only some of those
199 const auto &owners = world->regionOwners[rid];
200 const size_t numRegionOwners = owners.size();
201 const size_t ownerRank =
202 std::distance(owners.begin(), owners.find(workerRank()));
203 const bool regionTileOwner = (tileIndex % numRegionOwners) == ownerRank;
204 if (regionTileOwner) {
205 tasking::parallel_for(NUM_JOBS, [&](int tIdx) {
206 renderer->renderRegionToTile(dfb,
207 camera,
208 world,
209 world->allRegions[rid],
210 perFrameData,
211 tile,
212 tIdx);
213 });
214 // Unused
215 // tile.sortOrder = sortOrder[rid];
216 dfb->setTile(tile);
217 }
218 #if PARALLEL_REGION_RENDERING
219 });
220 #else
221 }
222 #endif
223 });
224 #ifdef ENABLE_PROFILING
225 end = ProfilingPoint();
226 std::cout << "Local rendering for frame " << elapsedTimeMs(start, end)
227 << "ms\n";
228 #endif
229
230 dfb->waitUntilFinished();
231 renderer->endFrame(dfb, perFrameData);
232
233 dfb->endFrame(renderer->errorThreshold, camera);
234 }
235
renderFrameReplicated(DistributedFrameBuffer * fb,Renderer * renderer,Camera * camera,DistributedWorld * world)236 void Distributed::renderFrameReplicated(DistributedFrameBuffer *fb,
237 Renderer *renderer,
238 Camera *camera,
239 DistributedWorld *world)
240 {
241 std::shared_ptr<TileOperation> tileOperation = nullptr;
242 if (fb->getLastRenderer() != renderer) {
243 tileOperation = std::make_shared<WriteMultipleTileOperation>();
244 fb->setTileOperation(tileOperation, renderer);
245 } else {
246 tileOperation = fb->getTileOperation();
247 }
248
249 #ifdef ENABLE_PROFILING
250 ProfilingPoint start;
251 #endif
252 fb->startNewFrame(renderer->errorThreshold);
253 void *perFrameData = renderer->beginFrame(fb, world);
254 #ifdef ENABLE_PROFILING
255 ProfilingPoint end;
256 std::cout << "Start new frame took: " << elapsedTimeMs(start, end) << "ms\n";
257 #endif
258
259 const auto fbSize = fb->getNumPixels();
260
261 const int ALLTASKS = fb->getTotalTiles();
262 int NTASKS = ALLTASKS / workerSize();
263
264 // NOTE(jda) - If all tiles do not divide evenly among all worker ranks
265 // (a.k.a. ALLTASKS / worker.size has a remainder), then
266 // some ranks will have one extra tile to do. Thus NTASKS
267 // is incremented if we are one of those ranks.
268 if ((ALLTASKS % workerSize()) > workerRank())
269 NTASKS++;
270
271 #ifdef ENABLE_PROFILING
272 start = ProfilingPoint();
273 #endif
274 /* TODO WILL: This can dispatch back to LocalTiledLoadBalancer::renderTiles
275 * to render the tiles instead of repeating this loop here ourselves.
276 */
277 tasking::parallel_for(NTASKS, [&](int taskIndex) {
278 const size_t tileID = taskIndex * workerSize() + workerRank();
279 const size_t numTiles_x = fb->getNumTiles().x;
280 const size_t tile_y = tileID / numTiles_x;
281 const size_t tile_x = tileID - tile_y * numTiles_x;
282 const vec2i tileId(tile_x, tile_y);
283 const int32 accumID = fb->accumID(tileId);
284
285 if (fb->tileError(tileId) <= renderer->errorThreshold)
286 return;
287
288 #if TILE_SIZE > MAX_TILE_SIZE
289 auto tilePtr = make_unique<Tile>(tileId, fbSize, accumID);
290 auto &tile = *tilePtr;
291 #else
292 Tile __aligned(64) tile(tileId, fbSize, accumID);
293 #endif
294
295 if (!fb->frameCancelled()) {
296 tasking::parallel_for(numJobs(renderer->spp, accumID), [&](size_t tid) {
297 renderer->renderTile(fb, camera, world, perFrameData, tile, tid);
298 });
299 }
300
301 fb->setTile(tile);
302 });
303 #ifdef ENABLE_PROFILING
304 end = ProfilingPoint();
305 std::cout << "Render loop took: " << elapsedTimeMs(start, end)
306 << "ms, CPU %: " << cpuUtilization(start, end) << "%\n";
307
308 start = ProfilingPoint();
309 #endif
310
311 fb->waitUntilFinished();
312
313 #ifdef ENABLE_PROFILING
314 end = ProfilingPoint();
315 std::cout << "Wait finished took: " << elapsedTimeMs(start, end)
316 << "ms, CPU %: " << cpuUtilization(start, end) << "%\n";
317
318 start = ProfilingPoint();
319 #endif
320
321 renderer->endFrame(fb, perFrameData);
322 fb->endFrame(renderer->errorThreshold, camera);
323
324 #ifdef ENABLE_PROFILING
325 end = ProfilingPoint();
326 std::cout << "End frame took: " << elapsedTimeMs(start, end)
327 << "ms, CPU %: " << cpuUtilization(start, end) << "%\n";
328 #endif
329 }
330
toString() const331 std::string Distributed::toString() const
332 {
333 return "ospray::mpi::staticLoadBalancer::Distributed";
334 }
335
renderTiles(FrameBuffer *,Renderer *,Camera *,World *,const utility::ArrayView<int> &,void *)336 void Distributed::renderTiles(FrameBuffer *,
337 Renderer *,
338 Camera *,
339 World *,
340 const utility::ArrayView<int> &,
341 void *)
342 {
343 NOT_IMPLEMENTED;
344 }
345
346 } // namespace staticLoadBalancer
347 } // namespace mpi
348 } // namespace ospray
349