/** \file
 * \brief Implementation of class FMEMultipoleKernel.
 *
 * \author Martin Gronemann
 *
 * \par License:
 * This file is part of the Open Graph Drawing Framework (OGDF).
 *
 * \par
 * Copyright (C)<br>
 * See README.md in the OGDF root directory for details.
 *
 * \par
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * Version 2 or 3 as published by the Free Software Foundation;
 * see the file LICENSE.txt included in the packaging of this file
 * for details.
 *
 * \par
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * \par
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, see
 * http://www.gnu.org/copyleft/gpl.html
 */

#include <ogdf/energybased/fast_multipole_embedder/FMEMultipoleKernel.h>

namespace ogdf {
namespace fast_multipole_embedder {

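// Naming conventions used below (standard fast multipole method / WSPD terminology,
// matching the functor names): P2M = point-to-multipole, M2M = multipole-to-multipole
// shift, M2L = multipole-to-local conversion, L2L = local-to-local shift, L2P =
// local-to-point evaluation, P2P = direct point-to-point evaluation. The WSPD is the
// well-separated pair decomposition of the quadtree; D2D and ND evaluate the stored
// direct pairs and direct nodes, respectively.

//! Builds the linear quadtree for the current iteration: computes the bounding box of
//! all points, assigns Morton numbers, sorts the points, constructs and links the tree,
//! and partitions it among the threads.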
void FMEMultipoleKernel::quadtreeConstruction(ArrayPartition& pointPartition)
{
	FMELocalContext*  localContext	= m_pLocalContext;
	FMEGlobalContext* globalContext = m_pGlobalContext;
	LinearQuadtree&	tree			= *globalContext->pQuadtree;

	// precompute the bounding box for the quadtree points from the graph nodes
	for_loop(pointPartition, min_max_x_function(localContext));
	for_loop(pointPartition, min_max_y_function(localContext));

	// wait until the thread's bounding box is computed
	sync();

	// let the main thread compute the bounding box of all thread bounding boxes
	if (isMainThread())
	{
		globalContext->min_x = globalContext->pLocalContext[0]->min_x;
		globalContext->min_y = globalContext->pLocalContext[0]->min_y;
		globalContext->max_x = globalContext->pLocalContext[0]->max_x;
		globalContext->max_y = globalContext->pLocalContext[0]->max_y;
		for (uint32_t j=1; j < numThreads(); j++)
		{
			Math::updateMin(globalContext->min_x, globalContext->pLocalContext[j]->min_x);
			Math::updateMin(globalContext->min_y, globalContext->pLocalContext[j]->min_y);
			Math::updateMax(globalContext->max_x, globalContext->pLocalContext[j]->max_x);
			Math::updateMax(globalContext->max_y, globalContext->pLocalContext[j]->max_y);
		}
		tree.init(globalContext->min_x, globalContext->min_y, globalContext->max_x, globalContext->max_y);
		globalContext->coolDown *= 0.999f;
		tree.clear();
	}
	// wait because the Morton number computation needs the bounding box
	sync();
	// update the Morton numbers to prepare the points for sorting
	for_loop(pointPartition, LQMortonFunctor(localContext));
	// wait so we can sort them by Morton number
	sync();

#ifdef OGDF_FME_PARALLEL_QUADTREE_SORT
	// use a simple parallel sorting algorithm
	LinearQuadtree::LQPoint* points = tree.pointArray();
	sort_parallel(points, tree.numberOfPoints(), LQPointComparer);
#else
	if (isMainThread())
	{
		LinearQuadtree::LQPoint* points = tree.pointArray();
		sort_single(points, tree.numberOfPoints(), LQPointComparer);
	}
#endif
	// wait because the quadtree builder needs the sorted order
	sync();
	// if this is not a parallel run, we can take the easy route
	if (isSingleThreaded())
	{
		LinearQuadtreeBuilder builder(tree);
		// prepare the tree
		builder.prepareTree();
		// and link it
		builder.build();
		LQPartitioner partitioner( localContext );
		partitioner.partition();
	} else // the more involved parallel case
	{
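		// Parallel construction: each thread prepares the node chains for its
		// cell-aligned slice of the sorted point array; after a sync, the main
		// thread concatenates the per-thread chains, links the complete tree,
		// and runs the partitioner.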
		// snap the left end of the thread's interval to the first point in its cell
		LinearQuadtree::PointID beginPoint = tree.findFirstPointInCell(pointPartition.begin);
		LinearQuadtree::PointID endPoint_plus_one;
		// if this thread is the last one, no snapping is required for the right end
		if (threadNr()==numThreads()-1)
			endPoint_plus_one = tree.numberOfPoints();
		else // otherwise find the first point of the next thread
			endPoint_plus_one = tree.findFirstPointInCell(pointPartition.end+1);

		// now we can prepare the snapped interval
		LinearQuadtreeBuilder builder(tree);
		// this function prepares the tree from beginPoint to endPoint_plus_one-1 (EXCLUDING endPoint_plus_one)
		builder.prepareTree(beginPoint, endPoint_plus_one);
		// save the start, end and count of the inner node chain in the context
		localContext->firstInnerNode = builder.firstInner;
		localContext->lastInnerNode = builder.lastInner;
		localContext->numInnerNodes = builder.numInnerNodes;
		// save the start, end and count of the leaf node chain in the context
		localContext->firstLeaf = builder.firstLeaf;
		localContext->lastLeaf = builder.lastLeaf;
		localContext->numLeaves = builder.numLeaves;
		// wait until all threads are finished
		sync();

		// now the main thread has to link the tree
		if (isMainThread())
		{
			// using its own builder
			LinearQuadtreeBuilder sbuilder(tree);
			// first we need the complete chain data
			sbuilder.firstInner = globalContext->pLocalContext[0]->firstInnerNode;
			sbuilder.firstLeaf = globalContext->pLocalContext[0]->firstLeaf;
			sbuilder.numInnerNodes = globalContext->pLocalContext[0]->numInnerNodes;
			sbuilder.numLeaves = globalContext->pLocalContext[0]->numLeaves;
			for (uint32_t j=1; j < numThreads(); j++)
			{
				sbuilder.numLeaves += globalContext->pLocalContext[j]->numLeaves;
				sbuilder.numInnerNodes += globalContext->pLocalContext[j]->numInnerNodes;
			}
			sbuilder.lastInner = globalContext->pLocalContext[numThreads()-1]->lastInnerNode;
			sbuilder.lastLeaf = globalContext->pLocalContext[numThreads()-1]->lastLeaf;
			// link the tree
			sbuilder.build();
			// and run the partitioner
			LQPartitioner partitioner(localContext);
			partitioner.partition();
		}
	}
	// wait for the tree to be finished
	sync();
	// now update the copy of the point data
	for_loop(pointPartition, LQPointUpdateFunctor(localContext));
	// compute the nodes' coordinates and sizes
	tree.forall_tree_nodes(LQCoordsFunctor(localContext), localContext->innerNodePartition.begin, localContext->innerNodePartition.numNodes)();
	tree.forall_tree_nodes(LQCoordsFunctor(localContext), localContext->leafPartition.begin, localContext->leafPartition.numNodes)();
}

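//! Single-threaded variant of the multipole approximation: the main thread performs the
//! full pipeline on the whole tree (P2M/M2M bottom-up pass, M2L plus direct evaluation
//! via a WSPD traversal, L2L top-down pass, and finally L2P with force collection).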
void FMEMultipoleKernel::multipoleApproxSingleThreaded(ArrayPartition& nodePointPartition)
{
	FMELocalContext*  localContext	= m_pLocalContext;
	FMEGlobalContext* globalContext = m_pGlobalContext;
	LinearQuadtree&	tree			= *globalContext->pQuadtree;
	if (isMainThread())
	{
		tree.bottom_up_traversal( // do a bottom-up traversal (M2M pass)
			if_then_else(tree.is_leaf_condition(), // if the current node is a leaf
				p2m_function(localContext), // then calculate the multipole coeff. due to the points in the leaf
				m2m_function(localContext) // else shift the coefficients of all children to the center of the inner node
			)
		)(tree.root());

		tree.forall_well_separated_pairs( // do a WSPD traversal (M2L and direct evaluation)
			pair_vice_versa(m2l_function(localContext)), // M2L for a well-separated pair
			p2p_function(localContext), // direct evaluation
			p2p_function(localContext) // direct evaluation
		)(tree.root());

		tree.top_down_traversal( // top-down traversal
			if_then_else( tree.is_leaf_condition(), // if the node is a leaf
				do_nothing(), // then do nothing, we will deal with this case later
				l2l_function(localContext) // else shift the node's local coeffs to the children
			)
		)(tree.root()); // start at the root

		// evaluate all leaves and store the forces in the thread's array
		for_loop(nodePointPartition, // loop over points
			func_comp( // composition of two statements
				l2p_function(localContext), // evaluate the forces due to the local expansion in the corresponding leaf
				collect_force_function // collect the forces of all threads with the following options:
				<
					FMECollect::RepulsiveFactor | // multiply by the repulsive factor stored in the global options
					FMECollect::Tree2GraphOrder | // thread data is stored in quadtree leaf order, transform it into graph order
					FMECollect::ZeroThreadArray // reset the thread's array
				>(localContext)
			)
		);
	}
}

//! The original algorithm, which computes the WSPD completely single-threaded
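//! (the P2M/M2M, M2L, direct-evaluation, and L2L passes are still distributed over the thread partitions).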
void FMEMultipoleKernel::multipoleApproxSingleWSPD(ArrayPartition& nodePointPartition)
{
	FMELocalContext*  localContext	= m_pLocalContext;
	FMEGlobalContext* globalContext = m_pGlobalContext;
	LinearQuadtree&	tree			= *globalContext->pQuadtree;

	// let the main thread run the WSPD
	if (isMainThread())
	{
		// the well-separated pairs decomposition
		tree.forall_well_separated_pairs( // do a WSPD traversal
				tree.StoreWSPairFunction(), // store the ws pairs in the WSPD
				tree.StoreDirectPairFunction(), // store the direct pairs
				tree.StoreDirectNodeFunction() // store the direct nodes
			)(tree.root()); // start at the root
	}

	// Note: we do not wait for the WSPD to finish. We do not need it yet for
	// the big multithreaded bottom-up traversal.
	for_tree_partition( // for all roots in the thread's tree partition
		tree.bottom_up_traversal( // do a bottom-up traversal
			if_then_else(tree.is_leaf_condition(), // if the current node is a leaf
				p2m_function(localContext), // then calculate the multipole coeff. due to the points in the leaf
				m2m_function(localContext) // else shift the coefficients of all children to the center of the inner node
			)
		)
	);
	sync();
	// the top of the tree has to be done by the main thread
	if (isMainThread())
	{
		tree.bottom_up_traversal( // start a bottom-up traversal
				if_then_else(tree.is_leaf_condition(), // if the current node is a leaf
					p2m_function(localContext), // then calculate the multipole coeff. due to the points in the leaf
					m2m_function(localContext) // else shift the coefficients of all children to the center of the inner node
				),
				not_condition(tree.is_fence_condition()) // stop when the fence to the threads is reached
			)(tree.root()); // start at the root
	}
	// wait until all nodes in the quadtree have their multipole coefficients
	sync();
	// M2L pass with the WSPD
	tree.forall_tree_nodes(M2LFunctor(localContext), localContext->innerNodePartition.begin, localContext->innerNodePartition.numNodes)();
	tree.forall_tree_nodes(M2LFunctor(localContext), localContext->leafPartition.begin, localContext->leafPartition.numNodes)();
	// D2D pass: evaluate the stored direct pairs and accumulate in the thread force array
	for_loop(arrayPartition(tree.numberOfDirectPairs()), D2DFunctor(localContext));
	// ND pass: evaluate the stored direct nodes and accumulate in the thread force array
	for_loop(arrayPartition(tree.numberOfDirectNodes()), NDFunctor(localContext));
	// wait until all local coeffs and all direct forces are computed
	sync();
	// big multithreaded top-down traversal; the top of the tree has to be done by the main thread
	if (isMainThread())
	{
		tree.top_down_traversal( // top-down traversal
			if_then_else( tree.is_leaf_condition(), // if the node is a leaf
				do_nothing(), // then do nothing, we will deal with the L2P pass later
				l2l_function(localContext) // else shift the node's local coeffs to the children
			),
			not_condition(tree.is_fence_condition()) // stop when the fence to the threads is reached
		)(tree.root()); // start at the root
	}
	sync();
	// the rest is done by the threads
	for_tree_partition( // for all roots in the thread's tree partition
		tree.top_down_traversal( // do a top-down traversal
			if_then_else( tree.is_leaf_condition(), // if the node is a leaf
				do_nothing(), // then do nothing, we will deal with the L2P pass later
				l2l_function(localContext) // else shift the node's local coeffs to the children
			)
		)
	);
	// wait until the traversal is finished and all leaves have their accumulated local coeffs
	sync();
	// evaluate all leaves and store the forces in the thread's array (note: we could store them
	// in the global array, but then we would have to use random access);
	// we can start collecting the forces immediately because the evaluation is done point by point
	for_loop(nodePointPartition, // loop over the thread's points
		func_comp( // composition of two statements
			l2p_function(localContext), // evaluate the forces due to the local expansion in the corresponding leaf
			collect_force_function // collect the forces of all threads with the following options:
			<
				FMECollect::RepulsiveFactor | // multiply by the repulsive factor stored in the global options
				FMECollect::Tree2GraphOrder | // thread data is stored in quadtree leaf order, transform it into graph order
				FMECollect::ZeroThreadArray // reset the thread's array to zero
			>(localContext)
		)
	);
}


//! Newer but slower method: parallel WSPD computation without using the WSPD structure
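//! (each thread runs the WSPD traversal directly on the roots of its tree partition, while the main thread handles the part of the tree above the fence nodes).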
void FMEMultipoleKernel::multipoleApproxNoWSPDStructure(ArrayPartition& nodePointPartition)
{
	FMELocalContext*  localContext	= m_pLocalContext;
	FMEGlobalContext* globalContext = m_pGlobalContext;
	LinearQuadtree&	tree			= *globalContext->pQuadtree;
	// big multithreaded bottom-up traversal
	for_tree_partition( // for all roots in the thread's tree partition
		tree.bottom_up_traversal( // do a bottom-up traversal
			if_then_else(tree.is_leaf_condition(), // if the current node is a leaf
				p2m_function(localContext), // then calculate the multipole coeff. due to the points in the leaf
				m2m_function(localContext) // else shift the coefficients of all children to the center of the inner node
			)
		)
	);
	sync();

	// the top of the tree has to be done by the main thread
	if (isMainThread())
	{
		tree.bottom_up_traversal( // start a bottom-up traversal
			if_then_else(tree.is_leaf_condition(), // if the current node is a leaf
				p2m_function(localContext), // then calculate the multipole coeff. due to the points in the leaf
				m2m_function(localContext) // else shift the coefficients of all children to the center of the inner node
			),
			not_condition(tree.is_fence_condition()))(tree.root()); // start at the root, stop when the fence to the threads is reached

		tree.forall_well_separated_pairs( // do a WSPD traversal
			pair_vice_versa(m2l_function(localContext)), // M2L for a well-separated pair
			p2p_function(localContext), // direct evaluation
			p2p_function(localContext), // direct evaluation
			not_condition(tree.is_fence_condition()))(tree.root());
	}
	// wait until all local coeffs and all direct forces are computed
	sync();

	// now a WSPD traversal for the roots in the tree partition
	for_tree_partition(
		tree.forall_well_separated_pairs( // do a WSPD traversal
			pair_vice_versa(m2l_function(localContext)), // M2L for a well-separated pair
			p2p_function(localContext), // direct evaluation
			p2p_function(localContext) // direct evaluation
		)
	);
	// wait until all local coeffs and all direct forces are computed
	sync();

	// big multithreaded top-down traversal; the top of the tree has to be done by the main thread
	if (isMainThread())
	{
		tree.top_down_traversal( // top-down traversal
			if_then_else( tree.is_leaf_condition(), // if the node is a leaf
				do_nothing(), // then do nothing, we will deal with this case later
				l2l_function(localContext) // else shift the node's local coeffs to the children
			),
			not_condition(tree.is_fence_condition()) // stop when the fence to the threads is reached
		)(tree.root()); // start at the root
	}
	// wait until the traversal is finished and all roots of the threads have their coefficients
	sync();

	for_tree_partition( // for all roots in the thread's tree partition
		tree.top_down_traversal( // do a top-down traversal
			if_then_else( tree.is_leaf_condition(), // if the node is a leaf
				do_nothing(), // then do nothing, we will deal with this case later
				l2l_function(localContext) // else shift the node's local coeffs to the children
			)
		)
	);
	// wait until the traversal is finished and all leaves have their accumulated local coeffs
	sync();
	// evaluate all leaves and store the forces in the thread's array (note: we could store them
	// in the global array, but then we would have to use random access);
	// we can start collecting the forces immediately because the evaluation is done point by point
	for_loop(nodePointPartition, // loop over the thread's points
		func_comp( // composition of two statements
			l2p_function(localContext), // evaluate the forces due to the local expansion in the corresponding leaf
			collect_force_function // collect the forces of all threads with the following options:
			<
				FMECollect::RepulsiveFactor | // multiply by the repulsive factor stored in the global options
				FMECollect::Tree2GraphOrder | // thread data is stored in quadtree leaf order, transform it into graph order
				FMECollect::ZeroThreadArray // reset the thread's array
			>(localContext)
		)
	);
}


//! The final approximation algorithm, which runs the WSPD in parallel without storing it in the threads' subtrees
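//! (the main thread stores the WSPD only for the top of the tree; the stored pairs are then evaluated in parallel via M2L/D2D/ND, while each thread computes the WSPD for its own subtrees on the fly).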
void FMEMultipoleKernel::multipoleApproxFinal(ArrayPartition& nodePointPartition)
{
	FMELocalContext*  localContext	= m_pLocalContext;
	FMEGlobalContext* globalContext = m_pGlobalContext;
	LinearQuadtree&	tree			= *globalContext->pQuadtree;
	// big multithreaded bottom-up traversal
	for_tree_partition( // for all roots in the thread's tree partition
		tree.bottom_up_traversal( // do a bottom-up traversal
			if_then_else(tree.is_leaf_condition(), // if the current node is a leaf
				p2m_function(localContext), // then calculate the multipole coeff. due to the points in the leaf
				m2m_function(localContext) // else shift the coefficients of all children to the center of the inner node
			)
		)
	);
	sync();
	// the top of the tree has to be done by the main thread
	if (isMainThread())
	{
		tree.bottom_up_traversal( // start a bottom-up traversal
			if_then_else(tree.is_leaf_condition(), // if the current node is a leaf
				p2m_function(localContext), // then calculate the multipole coeff. due to the points in the leaf
				m2m_function(localContext) // else shift the coefficients of all children to the center of the inner node
			),
			not_condition(tree.is_fence_condition()))(tree.root()); // start at the root, stop when the fence to the threads is reached

		tree.forall_well_separated_pairs( // do a WSPD traversal
			tree.StoreWSPairFunction(), // store the ws pairs in the WSPD
			tree.StoreDirectPairFunction(), // store the direct pairs
			tree.StoreDirectNodeFunction(), // store the direct nodes
			not_condition(tree.is_fence_condition()))(tree.root());
	}
	// wait for the main thread to finish
	sync();

	// M2L pass with the WSPD for the result of the single-threaded pass above
	tree.forall_tree_nodes(M2LFunctor(localContext), localContext->innerNodePartition.begin, localContext->innerNodePartition.numNodes)();
	tree.forall_tree_nodes(M2LFunctor(localContext), localContext->leafPartition.begin, localContext->leafPartition.numNodes)();

	// D2D and ND passes: evaluate the stored direct pairs and direct nodes and accumulate in the thread force array
	for_loop(arrayPartition(tree.numberOfDirectPairs()), D2DFunctor(localContext));
	for_loop(arrayPartition(tree.numberOfDirectNodes()), NDFunctor(localContext));

	// wait until all local coeffs and all direct forces are computed
	sync();

	// the rest of the WSPD can be computed on the fly by the threads
	for_tree_partition(
		tree.forall_well_separated_pairs( // do a WSPD traversal
			pair_vice_versa(m2l_function(localContext)), // M2L for a well-separated pair
			p2p_function(localContext), // direct evaluation
			p2p_function(localContext) // direct evaluation
		)
	);
	// wait until all local coeffs and all direct forces are computed
	sync();

	// big multithreaded top-down traversal; the top of the tree has to be done by the main thread
	if (isMainThread())
	{
		tree.top_down_traversal( // top-down traversal (L2L pass)
			if_then_else( tree.is_leaf_condition(), // if the node is a leaf
				do_nothing(), // then do nothing, we will deal with this case later
				l2l_function(localContext) // else shift the node's local coeffs to the children
			),
			not_condition(tree.is_fence_condition()) // stop when the fence to the threads is reached
		)(tree.root()); // start at the root
	}
	// wait for the top of the tree
	sync();

	for_tree_partition( // for all roots in the thread's tree partition (L2L pass)
		tree.top_down_traversal( // do a top-down traversal
			if_then_else( tree.is_leaf_condition(), // if the node is a leaf
				do_nothing(), // then do nothing, we will deal with this case later
				l2l_function(localContext) // else shift the node's local coeffs to the children
			)
		)
	);
	// wait until the traversal is finished and all leaves have their accumulated local coeffs
	sync();
	// evaluate all leaves and store the forces in the thread's array (note: we could store them
	// in the global array, but then we would have to use random access);
	// we can start collecting the forces immediately because the evaluation is done point by point
	for_loop(nodePointPartition, // loop over the thread's points
		func_comp( // composition of two statements
			l2p_function(localContext), // evaluate the forces due to the local expansion in the corresponding leaf
			collect_force_function // collect the forces of all threads with the following options:
			<
				FMECollect::RepulsiveFactor | // multiply by the repulsive factor stored in the global options
				FMECollect::Tree2GraphOrder | // thread data is stored in quadtree leaf order, transform it into graph order
				FMECollect::ZeroThreadArray // reset the thread's array
			>(localContext)
		)
	);
}

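//! Kernel entry point, called once per thread with the shared global context.
//! First runs a preprocessing phase that applies only edge forces, then iterates:
//! build the quadtree, approximate the repulsive forces with the multipole scheme,
//! add the edge forces, and move the nodes, until maxNumIterations is reached or
//! the main thread signals an early exit via the stop criterion.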
void FMEMultipoleKernel::operator()(FMEGlobalContext* globalContext)
{
	uint32_t					maxNumIterations    = globalContext->pOptions->maxNumIterations;
	uint32_t					minNumIterations    = globalContext->pOptions->minNumIterations;
	ArrayGraph&					graph				= *globalContext->pGraph;
	LinearQuadtree&				tree				= *globalContext->pQuadtree;
	LinearQuadtreeExpansion&	treeExp				= *globalContext->pExpansion;
	FMELocalContext*			localContext		= globalContext->pLocalContext[threadNr()];
	FMEGlobalOptions*			options				= globalContext->pOptions;
	float*						threadsForceArrayX	= localContext->forceX;
	float*						threadsForceArrayY	= localContext->forceY;
	float*						globalForceArrayX	= globalContext->globalForceX;
	float*						globalForceArrayY	= globalContext->globalForceY;

	ArrayPartition edgePartition = arrayPartition(graph.numEdges());
	ArrayPartition nodePointPartition = arrayPartition(graph.numNodes());
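	// each thread works on its own contiguous slice of the edge and node/point arrays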

	m_pLocalContext = localContext;
	m_pGlobalContext = globalContext;

	// INIT

	// reset the global force array
	for_loop_array_set(threadNr(), numThreads(), globalForceArrayX, tree.numberOfPoints(), 0.0f);
	for_loop_array_set(threadNr(), numThreads(), globalForceArrayY, tree.numberOfPoints(), 0.0f);

	// reset the thread's force array
	for (uint32_t i = 0; i < tree.numberOfPoints(); i++)
	{
		threadsForceArrayX[i] = 0.0f;
		threadsForceArrayY[i] = 0.0f;
	}

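	// preprocessing phase: a limited number of iterations that apply only the edge
	// forces (no multipole/repulsive approximation) before the main loop starts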
	uint32_t maxNumIt = options->preProcMaxNumIterations;
	for (uint32_t currNumIteration = 0; currNumIteration < maxNumIt; currNumIteration++)
	{
		// iterate over all edges and store the resulting forces in the thread's array
		for_loop(edgePartition,
			edge_force_function< static_cast<int>(FMEEdgeForce::DivDegree) >(localContext) // divide the forces by the degree of the node to avoid oscillation
			);
		// wait until all edges are done
		sync();
		// now collect the forces in parallel, put the sum into the global array and move the nodes accordingly
		for_loop(nodePointPartition,
			func_comp(
				collect_force_function<FMECollect::EdgeFactorRep | FMECollect::ZeroThreadArray>(localContext),
				node_move_function<TIME_STEP_PREP | ZERO_GLOBAL_ARRAY>(localContext)
			)
			);
	}
	if (isMainThread())
	{
		globalContext->coolDown = 1.0f;
	}
	sync();

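	// main loop: one full force-directed iteration per pass; the main thread sets
	// earlyExit once the maximum force drops below the stop criterion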
	for (uint32_t currNumIteration = 0; currNumIteration < maxNumIterations && !globalContext->earlyExit; currNumIteration++)
	{
		// reset the coefficients
		for_loop_array_set(threadNr(), numThreads(), treeExp.m_multiExp, treeExp.m_numExp*(treeExp.m_numCoeff << 1), 0.0);
		for_loop_array_set(threadNr(), numThreads(), treeExp.m_localExp, treeExp.m_numExp*(treeExp.m_numCoeff << 1), 0.0);

		localContext->maxForceSq = 0.0;
		localContext->avgForce = 0.0;

		// construct the quadtree
		quadtreeConstruction(nodePointPartition);
		// wait for all threads to finish
		sync();

		if (isSingleThreaded()) // if single-threaded, run the simple approximation
			multipoleApproxSingleThreaded(nodePointPartition);
		else // otherwise use the partitioning
			multipoleApproxFinal(nodePointPartition);
		// now wait until all forces are summed up in the global array and mapped to graph node order
		sync();

		// run the edge forces
		for_loop(edgePartition, // iterate over all edges and sum up the forces in the thread's array
			edge_force_function< static_cast<int>(FMEEdgeForce::DivDegree) >(localContext) // divide the forces by the degree of the node to avoid oscillation
		);
		// wait until the edges are finished
		sync();

		// collect the edge forces and move the nodes without waiting
		for_loop(nodePointPartition,
			func_comp(
				collect_force_function<FMECollect::EdgeFactorRep | FMECollect::ZeroThreadArray>(localContext),
				node_move_function<TIME_STEP_NORMAL | ZERO_GLOBAL_ARRAY>(localContext)
			)
		);
		// wait so we can decide whether we need another iteration
		sync();
		// check the maximum squared force over all threads
		if (isMainThread())
		{
			double maxForceSq = 0.0;
			for (uint32_t j=0; j < numThreads(); j++)
				Math::updateMax(maxForceSq, globalContext->pLocalContext[j]->maxForceSq);

			// if we are allowed to quit and the max force sq falls below the threshold, tell all threads we are done
			if ((currNumIteration >= minNumIterations) && (maxForceSq < globalContext->pOptions->stopCritForce ))
			{
				globalContext->earlyExit = true;
			}
		}
		// this sync is required so all threads see the earlyExit result
		sync();
	}
}

//! Allocates the global context and a local context for each thread
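//!
//! A minimal usage sketch (hypothetical driver code; in the embedder the kernel is
//! invoked once per thread with the same global context):
//! \code
//!   // 'kernel' is assumed to be an already constructed FMEMultipoleKernel
//!   FMEGlobalContext* ctx = kernel.allocateContext(&graph, &options, 1 /* numThreads */);
//!   kernel(ctx);                    // run the force-computation iterations
//!   kernel.deallocateContext(ctx);  // free all contexts and force arrays
//! \endcode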
FMEGlobalContext* FMEMultipoleKernel::allocateContext(ArrayGraph* pGraph, FMEGlobalOptions* pOptions, uint32_t numThreads)
{
	FMEGlobalContext* globalContext = new FMEGlobalContext();

	globalContext->numThreads = numThreads;
	globalContext->pOptions = pOptions;
	globalContext->pGraph = pGraph;
	globalContext->pQuadtree = new LinearQuadtree(pGraph->numNodes(), pGraph->nodeXPos(), pGraph->nodeYPos(), pGraph->nodeSize());
	globalContext->pWSPD = globalContext->pQuadtree->wspd();
	globalContext->pExpansion = new LinearQuadtreeExpansion(globalContext->pOptions->multipolePrecision, (*globalContext->pQuadtree));
	uint32_t numPoints = globalContext->pQuadtree->numberOfPoints();
	using FMELocalContextPtr = FMELocalContext*;

	globalContext->pLocalContext = new FMELocalContextPtr[numThreads];
	globalContext->globalForceX = (float*)OGDF_MALLOC_16(sizeof(float)*numPoints);
	globalContext->globalForceY = (float*)OGDF_MALLOC_16(sizeof(float)*numPoints);
	for (uint32_t i=0; i < numThreads; i++)
	{
		globalContext->pLocalContext[i] = new FMELocalContext;
		globalContext->pLocalContext[i]->forceX = (float*)OGDF_MALLOC_16(sizeof(float)*numPoints);
		globalContext->pLocalContext[i]->forceY = (float*)OGDF_MALLOC_16(sizeof(float)*numPoints);
		globalContext->pLocalContext[i]->pGlobalContext = globalContext;
	}
	return globalContext;
}

//! Frees the memory of all contexts
void FMEMultipoleKernel::deallocateContext(FMEGlobalContext* globalContext)
{
	uint32_t numThreads = globalContext->numThreads;
	for (uint32_t i=0; i < numThreads; i++)
	{
		OGDF_FREE_16(globalContext->pLocalContext[i]->forceX);
		OGDF_FREE_16(globalContext->pLocalContext[i]->forceY);
		delete globalContext->pLocalContext[i];
	}
	OGDF_FREE_16(globalContext->globalForceX);
	OGDF_FREE_16(globalContext->globalForceY);
	delete[] globalContext->pLocalContext;
	delete globalContext->pExpansion;
	delete globalContext->pQuadtree;
	delete globalContext;
}

}
}