1 /** \file
2 * \brief Implementation of class FMEMultipoleKernel.
3 *
4 * \author Martin Gronemann
5 *
6 * \par License:
7 * This file is part of the Open Graph Drawing Framework (OGDF).
8 *
9 * \par
10 * Copyright (C)<br>
11 * See README.md in the OGDF root directory for details.
12 *
13 * \par
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * Version 2 or 3 as published by the Free Software Foundation;
17 * see the file LICENSE.txt included in the packaging of this file
18 * for details.
19 *
20 * \par
21 * This program is distributed in the hope that it will be useful,
22 * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 * GNU General Public License for more details.
25 *
26 * \par
27 * You should have received a copy of the GNU General Public
28 * License along with this program; if not, see
29 * http://www.gnu.org/copyleft/gpl.html
30 */
31
32 #include <ogdf/energybased/fast_multipole_embedder/FMEMultipoleKernel.h>
33
34 namespace ogdf {
35 namespace fast_multipole_embedder {
36
//! Constructs the linear quadtree from the current point positions and partitions it among the threads.
//! All worker threads enter this function; sequential phases are restricted to the main thread
//! and separated from the parallel phases by sync() barriers.
void FMEMultipoleKernel::quadtreeConstruction(ArrayPartition& pointPartition)
{
	FMELocalContext* localContext = m_pLocalContext;
	FMEGlobalContext* globalContext = m_pGlobalContext;
	LinearQuadtree& tree = *globalContext->pQuadtree;

	// precompute the bounding box for the quadtree points from the graph nodes
	for_loop(pointPartition, min_max_x_function(localContext));
	for_loop(pointPartition, min_max_y_function(localContext));

	// wait until the thread's bounding box is computed
	sync();

	// let the main thread compute the bounding box of the per-thread bounding boxes
	if (isMainThread())
	{
		globalContext->min_x = globalContext->pLocalContext[0]->min_x;
		globalContext->min_y = globalContext->pLocalContext[0]->min_y;
		globalContext->max_x = globalContext->pLocalContext[0]->max_x;
		globalContext->max_y = globalContext->pLocalContext[0]->max_y;
		for (uint32_t j=1; j < numThreads(); j++)
		{
			Math::updateMin(globalContext->min_x, globalContext->pLocalContext[j]->min_x);
			Math::updateMin(globalContext->min_y, globalContext->pLocalContext[j]->min_y);
			Math::updateMax(globalContext->max_x, globalContext->pLocalContext[j]->max_x);
			Math::updateMax(globalContext->max_y, globalContext->pLocalContext[j]->max_y);
		}
		// initialize the quadtree geometry with the global bounding box
		tree.init(globalContext->min_x, globalContext->min_y, globalContext->max_x, globalContext->max_y);
		// gradually reduce the cooldown factor of the layout process
		globalContext->coolDown *= 0.999f;
		tree.clear();
	}
	// wait because the morton number computation needs the bounding box
	sync();
	// update morton numbers to prepare them for sorting
	for_loop(pointPartition, LQMortonFunctor(localContext));
	// wait so we can sort them by morton number
	sync();

#ifdef OGDF_FME_PARALLEL_QUADTREE_SORT
	// use a simple parallel sorting algorithm
	LinearQuadtree::LQPoint* points = tree.pointArray();
	sort_parallel(points, tree.numberOfPoints(), LQPointComparer);
#else
	if (isMainThread())
	{
		LinearQuadtree::LQPoint* points = tree.pointArray();
		sort_single(points, tree.numberOfPoints(), LQPointComparer);
	}
#endif
	// wait because the quadtree builder needs the sorted order
	sync();
	// if not a parallel run, we can do the easy way
	if (isSingleThreaded())
	{
		LinearQuadtreeBuilder builder(tree);
		// prepare the tree
		builder.prepareTree();
		// and link it
		builder.build();
		LQPartitioner partitioner( localContext );
		partitioner.partition();
	} else // the more difficult part
	{
		// snap the left point of the interval of the thread to the first in the cell
		LinearQuadtree::PointID beginPoint = tree.findFirstPointInCell(pointPartition.begin);
		LinearQuadtree::PointID endPoint_plus_one;
		// if this thread is the last one, no snapping required for the right point
		if (threadNr()==numThreads()-1)
			endPoint_plus_one = tree.numberOfPoints();
		else // find the left point of the next thread
			endPoint_plus_one = tree.findFirstPointInCell(pointPartition.end+1);

		// now we can prepare the snapped interval
		LinearQuadtreeBuilder builder(tree);
		// this function prepares the tree from begin point to endPoint_plus_one-1 (EXCLUDING endPoint_plus_one)
		builder.prepareTree(beginPoint, endPoint_plus_one);
		// save the start, end and count of the inner node chain in the context
		localContext->firstInnerNode = builder.firstInner;
		localContext->lastInnerNode = builder.lastInner;
		localContext->numInnerNodes = builder.numInnerNodes;
		// save the start, end and count of the leaf node chain in the context
		localContext->firstLeaf = builder.firstLeaf;
		localContext->lastLeaf = builder.lastLeaf;
		localContext->numLeaves = builder.numLeaves;
		// wait until all are finished
		sync();

		// now the main thread has to link the tree
		if (isMainThread())
		{
			// with its own builder
			LinearQuadtreeBuilder sbuilder(tree);
			// first we need the complete chain data, starting with thread 0's chain heads
			sbuilder.firstInner = globalContext->pLocalContext[0]->firstInnerNode;
			sbuilder.firstLeaf = globalContext->pLocalContext[0]->firstLeaf;
			sbuilder.numInnerNodes = globalContext->pLocalContext[0]->numInnerNodes;
			sbuilder.numLeaves = globalContext->pLocalContext[0]->numLeaves;
			for (uint32_t j=1; j < numThreads(); j++)
			{
				sbuilder.numLeaves += globalContext->pLocalContext[j]->numLeaves;
				sbuilder.numInnerNodes += globalContext->pLocalContext[j]->numInnerNodes;
			}
			// the chain tails come from the last thread's partition
			sbuilder.lastInner = globalContext->pLocalContext[numThreads()-1]->lastInnerNode;
			sbuilder.lastLeaf = globalContext->pLocalContext[numThreads()-1]->lastLeaf;
			// Link the tree
			sbuilder.build();
			// and run the partitions
			LQPartitioner partitioner(localContext);
			partitioner.partition();
		}
	}
	// wait for tree to finish
	sync();
	// now update the copy of the point data
	for_loop(pointPartition, LQPointUpdateFunctor(localContext));
	// compute the nodes coordinates and sizes
	tree.forall_tree_nodes(LQCoordsFunctor(localContext), localContext->innerNodePartition.begin, localContext->innerNodePartition.numNodes)();
	tree.forall_tree_nodes(LQCoordsFunctor(localContext), localContext->leafPartition.begin, localContext->leafPartition.numNodes)();
}
156
//! Single-threaded multipole approximation: the main thread runs the complete
//! FMM pipeline (P2M/M2M bottom-up, M2L + direct evaluation via WSPD traversal,
//! L2L top-down, then L2P + force collection) while all other threads fall through.
void FMEMultipoleKernel::multipoleApproxSingleThreaded(ArrayPartition& nodePointPartition)
{
	FMELocalContext* localContext = m_pLocalContext;
	FMEGlobalContext* globalContext = m_pGlobalContext;
	LinearQuadtree& tree = *globalContext->pQuadtree;
	if (isMainThread())
	{
		tree.bottom_up_traversal( // do a bottom up traversal M2M pass
			if_then_else(tree.is_leaf_condition(), // if the current node is a leaf
				p2m_function(localContext), // then calculate the multipole coeff. due to the points in the leaf
				m2m_function(localContext) // else shift the coefficients of all children to center of the inner node
			)
		)(tree.root());

		tree.forall_well_separated_pairs( // do a wspd traversal M2L direct eval
			pair_vice_versa(m2l_function(localContext)),// M2L for a well-separated pair
			p2p_function(localContext), // direct evaluation
			p2p_function(localContext) // direct evaluation
		)(tree.root());

		tree.top_down_traversal( // top down traversal
			if_then_else( tree.is_leaf_condition(), // if the node is a leaf
				do_nothing(), // then do nothing, we will deal with this case later
				l2l_function(localContext) // else shift the nodes local coeffs to the children
			)
		)(tree.root());// start at the root

		// evaluate all leaves and store the forces in the threads array
		for_loop(nodePointPartition, // loop over points
			func_comp( // composition of two statements
				l2p_function(localContext), // evaluate the forces due to the local expansion in the corresponding leaf
				collect_force_function // collect the forces of all threads with the following options:
				<
					FMECollect::RepulsiveFactor | // multiply by the repulsive factor stored in the global options
					FMECollect::Tree2GraphOrder | // threads data is stored in quadtree leaf order, transform it into graph order
					FMECollect::ZeroThreadArray // reset threads array
				>(localContext)
			)
		);
	}
}
198
199 //! the original algorithm which runs the WSPD completely single threaded
void FMEMultipoleKernel::multipoleApproxSingleWSPD(ArrayPartition& nodePointPartition)
{
	FMELocalContext* localContext = m_pLocalContext;
	FMEGlobalContext* globalContext = m_pGlobalContext;
	LinearQuadtree& tree = *globalContext->pQuadtree;

	// let the main thread run the WSPD
	if (isMainThread())
	{
		// The Well-separated pairs decomposition
		tree.forall_well_separated_pairs( // do a wspd traversal
			tree.StoreWSPairFunction(), // store the ws pairs in the WSPD
			tree.StoreDirectPairFunction(), // store the direct pairs
			tree.StoreDirectNodeFunction() // store the direct nodes
		)(tree.root()); // start at the root
	}

	// Note: don't wait for the WSPD to finish. We don't need it yet for
	// the big multithreaded bottom up traversal.
	for_tree_partition( // for all roots in the threads tree partition
		tree.bottom_up_traversal( // do a bottom up traversal
			if_then_else(tree.is_leaf_condition(), // if the current node is a leaf
				p2m_function(localContext), // then calculate the multipole coeff. due to the points in the leaf
				m2m_function(localContext) // else shift the coefficients of all children to center of the inner node
			)
		)
	);
	sync();
	// top of the tree has to be done by the main thread
	if (isMainThread())
	{
		tree.bottom_up_traversal( // start a bottom up traversal
			if_then_else(tree.is_leaf_condition(), // if the current node is a leaf
				p2m_function(localContext), // then calculate the multipole coeff. due to the points in the leaf
				m2m_function(localContext) // else shift the coefficients of all children to center of the inner node
			),
			not_condition(tree.is_fence_condition()) // stop when the fence to the threads is reached
		)(tree.root());// start at the root
	}
	// wait until all nodes in the quadtree have their multipole coeff
	sync();
	// M2L pass with the WSPD, split over the thread's inner-node and leaf partitions
	tree.forall_tree_nodes(M2LFunctor(localContext), localContext->innerNodePartition.begin, localContext->innerNodePartition.numNodes)();
	tree.forall_tree_nodes(M2LFunctor(localContext), localContext->leafPartition.begin, localContext->leafPartition.numNodes)();
	// D2D pass and store in the thread force array
	for_loop(arrayPartition(tree.numberOfDirectPairs()), D2DFunctor(localContext));
	// D pass and store in the thread force array
	for_loop(arrayPartition(tree.numberOfDirectNodes()), NDFunctor(localContext));
	// wait until all local coeffs and all direct forces are computed
	sync();
	// big multithreaded top down traversal. top of the tree has to be done by the main thread
	if (isMainThread())
	{
		tree.top_down_traversal( // top down traversal
			if_then_else( tree.is_leaf_condition(), // if the node is a leaf
				do_nothing(), // then do nothing, we will deal with L2P pass later
				l2l_function(localContext) // else shift the nodes local coeffs to the children
			),
			not_condition(tree.is_fence_condition()) //stop when the fence to the threads is reached
		)(tree.root());// start at the root
	}
	sync();
	// the rest is done by the threads
	for_tree_partition( // for all roots in the threads tree partition
		tree.top_down_traversal( // do a top down traversal
			if_then_else( tree.is_leaf_condition(), // if the node is a leaf
				do_nothing(), // then do nothing, we will deal with L2P pass later
				l2l_function(localContext) // else shift the nodes local coeffs to the children
			)
		)
	);
	// wait until the traversal is finished and all leaves have their accumulated local coeffs
	sync();
	// evaluate all leaves and store the forces in the threads array (Note: we can store them in the global array but then we have to use random access)
	// we can start immediately to collect the forces because we evaluated before point by point
	for_loop(nodePointPartition, // loop over threads points
		func_comp( // composition of two statements
			l2p_function(localContext), // evaluate the forces due to the local expansion in the corresponding leaf
			collect_force_function // collect the forces of all threads with the following options:
			<
				FMECollect::RepulsiveFactor| // multiply by the repulsive factor stored in the global options
				FMECollect::Tree2GraphOrder | // threads data is stored in quadtree leaf order, transform it into graph order
				FMECollect::ZeroThreadArray // reset threads array to zero
			>(localContext)
		)
	);
}
287
288
289 //! new but slower method, parallel wspd computation without using the wspd structure
void FMEMultipoleKernel::multipoleApproxNoWSPDStructure(ArrayPartition& nodePointPartition)
{
	FMELocalContext* localContext = m_pLocalContext;
	FMEGlobalContext* globalContext = m_pGlobalContext;
	LinearQuadtree& tree = *globalContext->pQuadtree;
	// big multithreaded bottom up traversal (P2M/M2M pass on the threads' subtrees).
	for_tree_partition( // for all roots in the threads tree partition
		tree.bottom_up_traversal( // do a bottom up traversal
			if_then_else(tree.is_leaf_condition(), // if the current node is a leaf
				p2m_function(localContext), // then calculate the multipole coeff. due to the points in the leaf
				m2m_function(localContext) // else shift the coefficients of all children to center of the inner node
			)
		)
	);
	sync();

	// top of the tree has to be done by the main thread
	if (isMainThread())
	{
		tree.bottom_up_traversal( // start a bottom up traversal
			if_then_else(tree.is_leaf_condition(), // if the current node is a leaf
				p2m_function(localContext), // then calculate the multipole coeff. due to the points in the leaf
				m2m_function(localContext) // else shift the coefficients of all children to center of the inner node
			),
			not_condition(tree.is_fence_condition()))(tree.root());// start at the root, stop when the fence to the threads is reached

		// evaluate the WSPD for the top of the tree on the fly (no WSPD storage)
		tree.forall_well_separated_pairs( // do a wspd traversal
			pair_vice_versa(m2l_function(localContext)), // M2L for a well-separated pair
			p2p_function(localContext), // direct evaluation
			p2p_function(localContext), // direct evaluation
			not_condition(tree.is_fence_condition()))(tree.root());
	}
	// wait until all local coeffs and all direct forces are computed
	sync();

	// now a wspd traversal for the roots in the tree partition
	for_tree_partition(
		tree.forall_well_separated_pairs( // do a wspd traversal
			pair_vice_versa(m2l_function(localContext)), // M2L for a well-separated pair
			p2p_function(localContext), // direct evaluation
			p2p_function(localContext) // direct evaluation
		)
	);
	// wait until all local coeffs and all direct forces are computed
	sync();

	// big multithreaded top down traversal. Top of the tree has to be done by the main thread
	if (isMainThread())
	{
		tree.top_down_traversal( // top down traversal
			if_then_else( tree.is_leaf_condition(), // if the node is a leaf
				do_nothing(), // then do nothing, we will deal with this case later
				l2l_function(localContext) // else shift the nodes local coeffs to the children
			),
			not_condition(tree.is_fence_condition()) //stop when the fence to the threads is reached
		)(tree.root()); // start at the root
	}
	// wait until the traversal is finished and all roots of the threads have their coefficients
	sync();

	for_tree_partition( // for all roots in the threads tree partition
		tree.top_down_traversal( // do a top down traversal
			if_then_else( tree.is_leaf_condition(), // if the node is a leaf
				do_nothing(), // then do nothing, we will deal with this case later
				l2l_function(localContext) // else shift the nodes local coeffs to the children
			)
		)
	);
	// wait until the traversal is finished and all leaves have their accumulated local coeffs
	sync();
	// evaluate all leaves and store the forces in the threads array (Note we can store them in the global array but then we have to use random access)
	// we can start immediately to collect the forces because we evaluated before point by point
	for_loop(nodePointPartition, // loop over threads points
		func_comp( // composition of two statements
			l2p_function(localContext), // evaluate the forces due to the local expansion in the corresponding leaf
			collect_force_function // collect the forces of all threads with the following options:
			<
				FMECollect::RepulsiveFactor | // multiply by the repulsive factor stored in the global options
				FMECollect::Tree2GraphOrder | // threads data is stored in quadtree leaf order, transform it into graph order
				FMECollect::ZeroThreadArray // reset threads array
			>(localContext)
		)
	);
}
374
375
376 //! the final approximation algorithm which runs the wspd parallel without storing it in the threads subtrees
void FMEMultipoleKernel::multipoleApproxFinal(ArrayPartition& nodePointPartition)
{
	FMELocalContext* localContext = m_pLocalContext;
	FMEGlobalContext* globalContext = m_pGlobalContext;
	LinearQuadtree& tree = *globalContext->pQuadtree;
	// big multithreaded bottom up traversal (P2M/M2M pass on the threads' subtrees).
	for_tree_partition( // for all roots in the threads tree partition
		tree.bottom_up_traversal( // do a bottom up traversal
			if_then_else(tree.is_leaf_condition(), // if the current node is a leaf
				p2m_function(localContext), // then calculate the multipole coeff. due to the points in the leaf
				m2m_function(localContext) // else shift the coefficients of all children to center of the inner node
			)
		)
	);
	sync();
	// top of the tree has to be done by the main thread
	if (isMainThread())
	{
		tree.bottom_up_traversal( // start a bottom up traversal
			if_then_else(tree.is_leaf_condition(), // if the current node is a leaf
				p2m_function(localContext), // then calculate the multipole coeff. due to the points in the leaf
				m2m_function(localContext) // else shift the coefficients of all children to center of the inner node
			),
			not_condition(tree.is_fence_condition()))(tree.root());// start at the root, stop when the fence to the threads is reached

		// store the WSPD above the fence; the part below is evaluated on the fly later
		tree.forall_well_separated_pairs( // do a wspd traversal
			tree.StoreWSPairFunction(), // store the ws pairs in the WSPD
			tree.StoreDirectPairFunction(), // store the direct pairs
			tree.StoreDirectNodeFunction(), // store the direct nodes
			not_condition(tree.is_fence_condition()))(tree.root());
	}
	// wait for the main thread to finish
	sync();

	// M2L pass with the WSPD for the result of the single threaded pass above
	tree.forall_tree_nodes(M2LFunctor(localContext), localContext->innerNodePartition.begin, localContext->innerNodePartition.numNodes)();
	tree.forall_tree_nodes(M2LFunctor(localContext), localContext->leafPartition.begin, localContext->leafPartition.numNodes)();

	// D2D pass and direct-node pass; store the results in the thread force array
	for_loop(arrayPartition(tree.numberOfDirectPairs()), D2DFunctor(localContext));
	for_loop(arrayPartition(tree.numberOfDirectNodes()), NDFunctor(localContext));

	// wait until all local coeffs and all direct forces are computed
	sync();

	// the rest of the WSPD can be done on the fly by the thread
	for_tree_partition(
		tree.forall_well_separated_pairs( // do a wspd traversal
			pair_vice_versa(m2l_function(localContext)), // M2L for a well-separated pair
			p2p_function(localContext), // direct evaluation
			p2p_function(localContext) // direct evaluation
		)
	);
	// wait until all local coeffs and all direct forces are computed
	sync();

	// big multithreaded top down traversal. top of the tree has to be done by the main thread
	if (isMainThread())
	{
		tree.top_down_traversal( // top down traversal L2L pass
			if_then_else( tree.is_leaf_condition(), // if the node is a leaf
				do_nothing(), // then do nothing, we will deal with this case later
				l2l_function(localContext) // else shift the nodes local coeffs to the children
			),
			not_condition(tree.is_fence_condition()) // stop when the fence to the threads is reached
		)(tree.root()); // start at the root,
	}
	// wait for the top of the tree
	sync();

	for_tree_partition( // for all roots in the threads tree partition L2L pass
		tree.top_down_traversal( // do a top down traversal
			if_then_else( tree.is_leaf_condition(), // if the node is a leaf
				do_nothing(), // then do nothing, we will deal with this case later
				l2l_function(localContext) // else shift the nodes local coeffs to the children
			)
		)
	);
	// wait until the traversal is finished and all leaves have their accumulated local coeffs
	sync();
	// evaluate all leaves and store the forces in the threads array (Note we can store them in the global array but then we have to use random access)
	// we can start immediately to collect the forces because we evaluated before point by point
	for_loop(nodePointPartition, // loop over threads points
		func_comp( // composition of two statements
			l2p_function(localContext), // evaluate the forces due to the local expansion in the corresponding leaf
			collect_force_function // collect the forces of all threads with the following options:
			<
				FMECollect::RepulsiveFactor | // multiply by the repulsive factor stored in the global options
				FMECollect::Tree2GraphOrder | // threads data is stored in quadtree leaf order, transform it into graph order
				FMECollect::ZeroThreadArray // reset threads array
			>(localContext)
		)
	);
}
471
//! Kernel entry point, executed concurrently by every worker thread.
//! Runs the preprocessing edge-force iterations followed by the main
//! force-directed loop (quadtree build, multipole approximation, edge forces,
//! node movement) until maxNumIterations or the early-exit criterion is met.
void FMEMultipoleKernel::operator()(FMEGlobalContext* globalContext)
{
	uint32_t maxNumIterations = globalContext->pOptions->maxNumIterations;
	uint32_t minNumIterations = globalContext->pOptions->minNumIterations;
	ArrayGraph& graph = *globalContext->pGraph;
	LinearQuadtree& tree = *globalContext->pQuadtree;
	LinearQuadtreeExpansion& treeExp = *globalContext->pExpansion;
	// this thread's local context and force accumulators
	FMELocalContext* localContext = globalContext->pLocalContext[threadNr()];
	FMEGlobalOptions* options = globalContext->pOptions;
	float* threadsForceArrayX = localContext->forceX;
	float* threadsForceArrayY = localContext->forceY;
	float* globalForceArrayX = globalContext->globalForceX;
	float* globalForceArrayY = globalContext->globalForceY;

	// split the edges and the node points among the threads
	ArrayPartition edgePartition = arrayPartition(graph.numEdges());
	ArrayPartition nodePointPartition = arrayPartition(graph.numNodes());

	m_pLocalContext = localContext;
	m_pGlobalContext = globalContext;

	// INIT

	// reset the global force array (work is striped over all threads)
	for_loop_array_set(threadNr(), numThreads(), globalForceArrayX, tree.numberOfPoints(), 0.0f);
	for_loop_array_set(threadNr(), numThreads(), globalForceArrayY, tree.numberOfPoints(), 0.0f);

	// reset the threads force array
	for (uint32_t i = 0; i < tree.numberOfPoints(); i++)
	{
		threadsForceArrayX[i] = 0.0f;
		threadsForceArrayY[i] = 0.0f;
	}

	// preprocessing: a few iterations with edge forces only
	uint32_t maxNumIt = options->preProcMaxNumIterations;
	for (uint32_t currNumIteration = 0; currNumIteration < maxNumIt; currNumIteration++)
	{
		// iterate over all edges and store the resulting forces in the threads array
		for_loop(edgePartition,
			edge_force_function< static_cast<int>(FMEEdgeForce::DivDegree) > (localContext) // divide the forces by degree of the node to avoid oscillation
		);
		// wait until all edges are done
		sync();
		// now collect the forces in parallel and put the sum into the global array and move the nodes accordingly
		for_loop(nodePointPartition,
			func_comp(
				collect_force_function<FMECollect::EdgeFactorRep | FMECollect::ZeroThreadArray >(localContext),
				node_move_function<TIME_STEP_PREP | ZERO_GLOBAL_ARRAY>(localContext)
			)
		);
	}
	// reset the cooldown before the main loop starts
	if (isMainThread())
	{
		globalContext->coolDown = 1.0f;
	}
	sync();

	// main loop: repulsive (multipole) + edge forces until convergence or iteration limit
	for (uint32_t currNumIteration = 0; currNumIteration < maxNumIterations && !globalContext->earlyExit; currNumIteration++)
	{
		// reset the coefficients (multipole and local expansions; factor 2 for real/imag parts)
		for_loop_array_set(threadNr(), numThreads(), treeExp.m_multiExp, treeExp.m_numExp*(treeExp.m_numCoeff << 1), 0.0);
		for_loop_array_set(threadNr(), numThreads(), treeExp.m_localExp, treeExp.m_numExp*(treeExp.m_numCoeff << 1), 0.0);

		localContext->maxForceSq = 0.0;
		localContext->avgForce = 0.0;

		// construct the quadtree
		quadtreeConstruction(nodePointPartition);
		// wait for all threads to finish
		sync();

		if (isSingleThreaded()) // if is single threaded run the simple approximation
			multipoleApproxSingleThreaded(nodePointPartition);
		else // otherwise use the partitioning
			multipoleApproxFinal(nodePointPartition);
		// now wait until all forces are summed up in the global array and mapped to graph node order
		sync();

		// run the edge forces
		for_loop(edgePartition, // iterate over all edges and sum up the forces in the threads array
			edge_force_function< static_cast<int>(FMEEdgeForce::DivDegree) >(localContext) // divide the forces by degree of the node to avoid oscillation
		);
		// wait until edges are finished
		sync();

		// collect the edge forces and move nodes without waiting
		for_loop(nodePointPartition,
			func_comp(
				collect_force_function<FMECollect::EdgeFactorRep | FMECollect::ZeroThreadArray>(localContext),
				node_move_function<TIME_STEP_NORMAL | ZERO_GLOBAL_ARRAY>(localContext)
			)
		);
		// wait so we can decide if we need another iteration
		sync();
		// check the max force square for all threads
		if (isMainThread())
		{
			double maxForceSq = 0.0;
			for (uint32_t j=0; j < numThreads(); j++)
				Math::updateMax(maxForceSq, globalContext->pLocalContext[j]->maxForceSq);

			// if we are allowed to quit and the max force sq falls under the threshold tell all threads we are done
			if ((currNumIteration >= minNumIterations) && (maxForceSq < globalContext->pOptions->stopCritForce ))
			{
				globalContext->earlyExit = true;
			}
		}
		// this is required to wait for the earlyExit result
		sync();
	}
}
582
//! allocates the global context and a local context for each thread
allocateContext(ArrayGraph * pGraph,FMEGlobalOptions * pOptions,uint32_t numThreads)584 FMEGlobalContext* FMEMultipoleKernel::allocateContext(ArrayGraph* pGraph, FMEGlobalOptions* pOptions, uint32_t numThreads)
585 {
586 FMEGlobalContext* globalContext = new FMEGlobalContext();
587
588 globalContext->numThreads = numThreads;
589 globalContext->pOptions = pOptions;
590 globalContext->pGraph = pGraph;
591 globalContext->pQuadtree = new LinearQuadtree(pGraph->numNodes(), pGraph->nodeXPos(), pGraph->nodeYPos(), pGraph->nodeSize());
592 globalContext->pWSPD = globalContext->pQuadtree->wspd();
593 globalContext->pExpansion = new LinearQuadtreeExpansion(globalContext->pOptions->multipolePrecision, (*globalContext->pQuadtree));
594 uint32_t numPoints = globalContext->pQuadtree->numberOfPoints();
595 using FMELocalContextPtr = FMELocalContext*;
596
597 globalContext->pLocalContext = new FMELocalContextPtr[numThreads];
598 globalContext->globalForceX = (float*)OGDF_MALLOC_16(sizeof(float)*numPoints);
599 globalContext->globalForceY = (float*)OGDF_MALLOC_16(sizeof(float)*numPoints);
600 for (uint32_t i=0; i < numThreads; i++)
601 {
602 globalContext->pLocalContext[i] = new FMELocalContext;
603 globalContext->pLocalContext[i]->forceX = (float*)OGDF_MALLOC_16(sizeof(float)*numPoints);
604 globalContext->pLocalContext[i]->forceY = (float*)OGDF_MALLOC_16(sizeof(float)*numPoints);
605 globalContext->pLocalContext[i]->pGlobalContext = globalContext;
606 }
607 return globalContext;
608 }
609
610 //! frees the memory for all contexts
deallocateContext(FMEGlobalContext * globalContext)611 void FMEMultipoleKernel::deallocateContext(FMEGlobalContext* globalContext)
612 {
613 uint32_t numThreads = globalContext->numThreads;
614 for (uint32_t i=0; i < numThreads; i++)
615 {
616 OGDF_FREE_16(globalContext->pLocalContext[i]->forceX);
617 OGDF_FREE_16(globalContext->pLocalContext[i]->forceY);
618 delete globalContext->pLocalContext[i];
619 }
620 OGDF_FREE_16(globalContext->globalForceX);
621 OGDF_FREE_16(globalContext->globalForceY);
622 delete[] globalContext->pLocalContext;
623 delete globalContext->pExpansion;
624 delete globalContext->pQuadtree;
625 delete globalContext;
626 }
627
628 }
629 }
630