1 #ifndef _PAIRWISE_SUM_h_
2 #define _PAIRWISE_SUM_h_
3 /* PAIRWISE_SUM.h
4  *
5  * Copyright (C) 2017-2020 Paul Boersma <paul.boersma@uva.nl>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions must retain the above copyright notice,
11  *    this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright notice,
14  *    this list of conditions and the following disclaimer in the documentation
15  *    and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
21  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  * (This is the two-clause BSD license, which is approximately identical to CC-BY.)
29  */
30 
31 /*
32 	# PAIRWISE SUMMATION FOR C AND C++
33 
34 	The pairwise summation algorithm as implemented here is approximately 1.5 times
35 	faster than the usual naive sequential summation loop,
36 	and is also hundreds of times more precise than sequential summation.
37 	Using pairwise summation instead of sequential summation will improve all your
38 	code for sums, means, inner products, norms, matrix multiplications,
39 	and CPU-based neural networks.
40 
41 	This is not necessarily a small thing in terms of CO2.
42 
43 	The macro you will call is
44 
45 		PAIRWISE_SUM (
46 			AccumulatorType,     // the type of the accumulator: long double, double, float;
47 			sumVariableName,     // the variable name of the accumulator, e.g. "sum";
48 			CounterType,         // the type of the term counters: long, int, int64_t, intptr_t;
49 			sizeExpression,      // the total number of terms to sum;
50 			initializeStatement, // the statement that declares and initializes the loop pointer(s),
51 			                     // which is/are typically a pointer or pointers to the data;
52 			termExpression       // the expression that computes a "term" (thing to sum)
53 			                     // from a piece or pieces of data.
54 			incrementStatement,  // the statement that increments the loop pointer(s)
55 			                     // before the next piece(s) of data is/are fetched;
56 		)
57 
58 	We explain this in detail below, pedagogically starting with an analogous macro
59 	for the more usual "sequential" summation algorithm.
60 
61 	## 1. SEQUENTIAL SUMMATION: A SINGLE ACCUMULATOR
62 
63 	The simplest use of summation of multiple values is if you have a single array of
64 	floating-point data to sum. If this array is x [1..size],
65 	then the usual straightforward algorithm for adding the terms is
66 
67 		long double sum = 0.0;
68 		for (long i = 1; i <= size; i ++)
69 			sum += x [i];
70 		printf ("%.17g", (double) sum);
71 
72 	This is *sequential summation*: a single *accumulator*, namely the variable `sum`,
73 	accumulates every element sequentially. Note that even though the array `x` will
74 	typically be of type `double`, the accumulator is of type `long double` in this example:
75 	using 80 bits for summing is much better than using 64 bits (1000 times more precise),
76 	and on modern computers it is usually equally fast.
77 
78 	The formulation above, which uses array indexing, is equivalent to a formulation
79 	in terms of a "looping pointer":
80 
81 		long double sum = 0.0;
82 		const double *xx = & x [1];   // the looping pointer
83 		for (long i = 1; i <= size; i ++) {
84 			sum += *xx;
85 			xx += 1;
86 		}
87 		printf ("%.17g", (double) sum);
88 
89 	The looping-pointer formulation consists of defining (declaring and initializing)
90 	the pointer before the loop, and then inside the loop
91 	first dereferencing the pointer and then incrementing it.
92 
93 	In this little algorithm we can discern the following seven ingredients:
94 	- AccumulatorType: long double
95 	- sumVariableName: "sum"
96 	- CounterType: long
97 	- sizeExpression: "size"
98 	- initializeStatement: "const double *xx = x"
99 	- termExpression: "*xx"
100 	- incrementStatement: "xx += 1"
101 
102 	The algorithm can therefore be replaced with
103 
104 		SEQUENTIAL_SUM (long double, sum, long, size, const double *xx = & x [1], *xx, xx += 1)
105 		printf ("%.17g", (double) sum);
106 
107 	where the SEQUENTIAL_SUM macro is defined as:
108 */
109 #define SEQUENTIAL_SUM(AccumulatorType, sumVariableName, CounterType, sizeExpression, \
110 	initializeStatement, termExpression, incrementStatement) \
111 \
112 	AccumulatorType sumVariableName = 0.0; \
113 	{/* scope */ \
114 		initializeStatement; \
115 		CounterType _n = sizeExpression;   /* to ensure that the size expression is evaluated only once */ \
116 		for (CounterType _i = 1; _i <= _n; _i ++) { \
117 			sumVariableName += termExpression; \
118 			incrementStatement; \
119 		} \
120 	}
121 
122 /*
123 	## 2. SEQUENTIAL SUMMATION IN MACHINE CODE
124 
125 	How would you sum the four numbers a, b, c and d,
126 	if all that your computer can do is adding two numbers at a time?
127 
128 	You could write the sum in one stroke in C, without parentheses:
129 
130 		sum = a + b + c + d;
131 
132 	Evaluation of pluses in C proceeds from left to right, so this formulation means the same as
133 
134 		sum = ((a + b) + c) + d;
135 
136 	In machine-like C, using the "registers" r1, r2, r3 and r4, you could implement this as
137 
138 		r1 = a;
139 		r2 = b;
140 		r1 += r2;
141 		r3 = c;
142 		r1 += r3;
143 		r4 = d;
144 		r1 += r4;
145 		sum = r1;
146 
147 	All these three formulations lead to identical true machine code,
148 	at least with Clang or GCC, and with optimization option -O3.
149 
150 
151 	## 3. SPEED OF SEQUENTIAL SUMMATION
152 
153 	If your processor has good prefetching, then the four loads from RAM (moving a, b, c,
154 	and d into the registers r1, r2, r3 and r4) will take up no time at all.
155 	The three additions, however, will require three clock cycles in total:
156 	as all additions are performed on r1, each addition will have to wait
157 	until the previous addition has finished. Adding up 64 numbers will require 63 cycles.
158 
159 
160 	## 4. PRECISION OF SEQUENTIAL SUMMATION
161 
162 	The worst-case floating-point rounding error for sequential summation is N * epsilon,
163 	where N is the number of numbers to add, and epsilon is the precision of one number.
164 	The expected random-walk rounding error is epsilon * sqrt (N).
165 
166 
167 	## 5. PAIRWISE SUMMATION: MULTIPLE ACCUMULATORS
168 
169 	An alternative way to add the four numbers, next to sequential summation,
170 	is to proceed in a *pairwise* manner, dividing the four numbers into two groups,
171 	then summing these separately, then summing the two results. In short C this would be:
172 
173 		sum = (a + b) + (c + d);
174 
175 	This pairwise summation continues recursively. For instance, you add 8 numbers this way:
176 
177 		sum = ((a + b) + (c + d)) + ((e + f) + (g + h));
178 
179 
180 	## 6. PAIRWISE SUMMATION IN MACHINE CODE
181 
182 	In machine-like C, using the "registers" r1, r2, r3 and r4, you can implement this as
183 
184 		r1 = a;
185 		r2 = b;
186 		r1 += r2;
187 		r3 = c;
188 		r4 = d;
189 		r3 += r4;
190 		r1 += r3;
191 		sum = r1;
192 
193 	The number of loads, stores and additions is identical to those in the sequential case.
194 
195 
196 	## 7. SPEED OF PAIRWISE SUMMATION
197 
198 	Pairwise summation can be faster than sequential summation,
199 	because the addition of r2 to r1 can be performed in parallel to the addition of r4 to r3.
200 	The three additions in section 5 could therefore be performed in two clock cycles.
201 	Indeed, both compilers that I am using in 2017 (Clang on the Mac, and GCC on Windows and Linux)
202 	take only 0.30 nanoseconds per addition, which is just over two-thirds of the clock period
203 	of the 2.3 GHz processor (which is 0.435 nanoseconds) of my late 2013 Macbook Pro.
204 	On a processor far, far away, which has more than 64 registers, perfect prefetching,
205 	and perfectly parallel operations on independent registers, 64 terms may be added
206 	within 6 clock cycles: six is the number of times we need to add something to r1,
207 	and each of these additions has to wait for the result of the previous addition.
208 	Theoretically, then, but not yet in practice, the execution time has fallen from order N
209 	to order log(N), at least for low N (and for high N it will be of order N,
210 	but with a much lower multiplication factor than sequential summation has).
211 
212 
213 	## 8. PRECISION OF PAIRWISE SUMMATION
214 
215 	The worst-case rounding error is only epsilon * log (N),
216 	and the expected random-walk rounding error is only epsilon * sqrt (log (N)).
217 	Both are much better than in the sequential summation case.
218 
219 
220 	## 9. HOW TO USE PAIRWISE SUMMATION
221 
222 	For pairwise summation we use the exact same macro arguments as for sequential summation:
223 
224 		PAIRWISE_SUM (long double, sum, long, size, const double *xx = & x [1], *xx, xx += 1)
225 		printf ("%.17g", (double) sum);
226 
227 	The macro starts by declaring a variable of type `AccumulatorType` and name `sumVariableName`.
228 	Thus, the expansion of the macro effectively starts with the declaration `long double sum;`
229 	and the variable `sum` will be available after the macro ends,
230 	so that you can use it in your subsequent code, as is done here by using `(double) sum`.
231 	This variable is used to accumulate the resulting sum and should for best results
232 	therefore be capable of accumulating partial sums with little rounding error.
233 	For this reason, you will usually want AccumulatorType to be `long double`.
234 	If long doubles are not available on your platform, or if they are slow on your platform,
235 	you can use `double` instead, or even `float`.
236 
237 	The size of the array to sum comes in in the argument `sizeExpression`, and in the expansion
238 	of the macro this number and all macro-internal counters are of type `CounterType`.
239 	In the example above I used `long` for simplicity, but you should note that on 64-bit Windows
240 	a `long` is only 32 bits, whereas the natural type for array indexing on a 64-bit machine
241 	would be `int64_t`; I therefore tend to use a CounterType of `intptr_t` myself,
242 	because this is 32 bits on 32-bit machines and 64 bits on 64-bit machines.
243 	If your `size` is an `int` (always 32 bits nowadays), you would use a CounterType of `int`;
244 	it is not beneficial to use a wider CounterType than the width of your `size`.
245 
246 	The fifth argument of the macro declares and initializes the loop pointer,
247 	as in `const double *xx = & x [1]` above. This pointer starts out pointing at the first
248 	element of the array.
249 
250 	The sixth argument of the macro is an expression that should evaluate to the value of a term,
251 	given the current value of the loop pointer(s), as in `*xx` above.
252 
253 	The seventh argument is the formula you use for incrementing the loop pointer(s),
254 	as in `xx += 1` above. The macro uses this formula to prepare for the retrieval of
255 	the next term in the summation.
256 
257 	More complicated use cases than a single array also exist. For instance, if you have to compute
258 	the inner product of the two arrays x [1..n] and y [1..n], you can do
259 
260 		PAIRWISE_SUM (long double, inner, long, n,
261 			const double *xx = & x [1];   // the semicolon ensures that this line and the next form a single argument
262 			const double *yy = & y [1],
263 			(long double) *xx * (long double) *yy,
264 			(xx += 1, yy += 1)
265 		)
266 		printf ("%.17g", (double) inner);
267 
268 	Note for the sixth argument: as 64-bit multiplication loses a lot of the precision of its
269 	two 64-bit operands, it is advisable to convert both operands to `long double`
270 	*before* the multiplication, as is done here. This usually costs no extra computation
271 	time (it can actually be faster). If you do
272 
273 		PAIRWISE_SUM (long double, inner, long, n,
274 			const double *xx = & x [1]; const double *yy = & y [1], *xx * *yy, (++ xx, ++ yy))
275 		printf ("%.17g", (double) inner);
276 
277 	instead, the conversion to `long double` is done (by the macro) *after* the multiplication,
278 	which is less precise.
279 
280 	Note for the seventh argument: you can see here that you can do the two increments simultaneously
281 	by using parentheses and a comma; fortunately, the C macro preprocessor understands enough
282 	about parentheses to see that you mean the seventh argument to be a single argument.
283 
284 	If you find the tricks with the semicolon and the parentheses too arcane, you can also just do
285 
286 		PAIRWISE_SUM (long double, inner, long, n,
287 			long i = 1,
288 			(long double) x [i] * (long double) y [i],
289 			i ++
290 		)
291 		printf ("%.17g", (double) inner);
292 
293 	and hope (i.e. test) that the compiler creates equally fast code. This method with its
294 	additional loop counter will usually be needed anyway when you want to *nest* pairwise sums.
295 
296 	Other use cases include array multiplication with strides...
297 
298 		PAIRWISE_SUM (long double, inner, long, n,
299 			const double *xx = & x [1];   // note the funny semicolon again
300 			const double *yy = & y [1],
301 			(long double) *xx * (long double) *yy,
302 			(xx += xstride, yy += ystride)
303 		)
304 		printf ("%.17g", (double) inner);
305 
306 	... and small-lag convolution...
307 
308 	for (long i = 1; i <= xSize - kernelSize + 1; i ++) {
309 		PAIRWISE_SUM (long double, conv, long, kernelSize,
310 			const double *xx = & x [i];
311 			const double *filter = & kernel [kernelSize],
312 			(long double) *xx * (long double) *filter,
313 			(xx += 1, filter -= 1)
314 		)
315 		result [i] = (double) conv;
316 	}
317 
318 	... and matrix multiplication, and computing norms.
319 
320 
321 	## 10. IMPLEMENTATION OF PAIRWISE SUMMATION: LOW POWERS OF 2
322 
323 	We have to implement pairwise summation with a macro,
324 	because the sixth and seventh arguments to PAIRWISE_SUM() have to be formulas for getting
325 	the next element to add. We use fixed formulas for adding 2, 4, 8, 16 or 32 terms,
326 	and a stack for the next 57 higher powers of 2, up to 2^62,
327 	so that our summation will work for N up to 2^63 - 1.
328 	The fixed formulas for the low powers of 2 are recursively defined macros:
329 */
330 
331 #define PAIRWISE_SUM_1_TERM(AccumulatorType, accumulator, termExpression, incrementStatement) \
332 	AccumulatorType accumulator = termExpression; \
333 	incrementStatement;
334 
335 #define PAIRWISE_SUM_2_TERMS(AccumulatorType, accumulator, termExpression, incrementStatement) \
336 	PAIRWISE_SUM_1_TERM (AccumulatorType, accumulator, termExpression, incrementStatement) \
337 	{ \
338 		PAIRWISE_SUM_1_TERM (AccumulatorType, _r2, termExpression, incrementStatement) \
339 		accumulator += _r2; \
340 	}
341 
342 #define PAIRWISE_SUM_4_TERMS(AccumulatorType, accumulator, termExpression, incrementStatement) \
343 	PAIRWISE_SUM_2_TERMS (AccumulatorType, accumulator, termExpression, incrementStatement) \
344 	{ \
345 		PAIRWISE_SUM_2_TERMS (AccumulatorType, _r3, termExpression, incrementStatement) \
346 		accumulator += _r3; \
347 	}
348 
349 #define PAIRWISE_SUM_8_TERMS(AccumulatorType, accumulator, termExpression, incrementStatement) \
350 	PAIRWISE_SUM_4_TERMS (AccumulatorType, accumulator, termExpression, incrementStatement) \
351 	{ \
352 		PAIRWISE_SUM_4_TERMS (AccumulatorType, _r4, termExpression, incrementStatement) \
353 		accumulator += _r4; \
354 	}
355 
356 #define PAIRWISE_SUM_16_TERMS(AccumulatorType, accumulator, termExpression, incrementStatement) \
357 	PAIRWISE_SUM_8_TERMS (AccumulatorType, accumulator, termExpression, incrementStatement) \
358 	{ \
359 		PAIRWISE_SUM_8_TERMS (AccumulatorType, _r5, termExpression, incrementStatement) \
360 		accumulator += _r5; \
361 	}
362 
363 #define PAIRWISE_SUM_32_TERMS(AccumulatorType, accumulator, termExpression, incrementStatement) \
364 	PAIRWISE_SUM_16_TERMS (AccumulatorType, accumulator, termExpression, incrementStatement) \
365 	{ \
366 		PAIRWISE_SUM_16_TERMS (AccumulatorType, _r6, termExpression, incrementStatement) \
367 		accumulator += _r6; \
368 	}
369 /*
370 	(The difference between the variable names "_r2", "_r3" and so on is not strictly needed,
371 	but making them the same would lead the compiler to issue warnings about shadowed variables.)
372 */
373 
374 /*
375 	## 11. IMPLEMENTATION OF PAIRWISE ADDITION: HIGH POWERS OF 2
376 
377 	Higher powers of 2 than 2^5 (= 32) go on a stack. We sum 64 values at each stroke,
378 	and this requires a fixed formula for these 64 terms:
379 */
380 #define PAIRWISE_SUM_64_TERMS(AccumulatorType, accumulator, termExpression, incrementStatement) \
381 	PAIRWISE_SUM_32_TERMS (AccumulatorType, accumulator, termExpression, incrementStatement) \
382 	{ \
383 		PAIRWISE_SUM_32_TERMS (AccumulatorType, _r7, termExpression, incrementStatement) \
384 		accumulator += _r7; \
385 	}
386 #define PAIRWISE_SUM_128_TERMS(AccumulatorType, accumulator, termExpression, incrementStatement) \
387 	PAIRWISE_SUM_64_TERMS (AccumulatorType, accumulator, termExpression, incrementStatement) \
388 	{ \
389 		PAIRWISE_SUM_64_TERMS (AccumulatorType, _r8, termExpression, incrementStatement) \
390 		accumulator += _r8; \
391 	}
392 #define PAIRWISE_SUM_256_TERMS(AccumulatorType, accumulator, termExpression, incrementStatement) \
393 	PAIRWISE_SUM_128_TERMS (AccumulatorType, accumulator, termExpression, incrementStatement) \
394 	{ \
395 		PAIRWISE_SUM_128_TERMS (AccumulatorType, _r9, termExpression, incrementStatement) \
396 		accumulator += _r9; \
397 	}
398 
399 /*
400 	A generalization about the timing of the summations in all the above macros
401 	is that r(i+1) is added to r(i) precisely when r(i+1) contains the same
402 	number of terms as r(i). This criterion for collapsing the partial sums
403 	is also used in the stack logic below.
404 */
405 
406 #define PAIRWISE_SUM(AccumulatorType, sumVariableName, CounterType, sizeExpression, \
407 	initializeStatement, termExpression, incrementStatement) \
408 \
409 	AccumulatorType sumVariableName = 0.0; \
410 	{/* scope */ \
411 		initializeStatement; \
412 		CounterType _n = sizeExpression; \
413 		if (_n & 1) { \
414 			PAIRWISE_SUM_1_TERM (AccumulatorType, _partialSum, termExpression, incrementStatement) \
415 			sumVariableName += _partialSum; \
416 		} \
417 		if (_n & 2) { \
418 			PAIRWISE_SUM_2_TERMS (AccumulatorType, _partialSum, termExpression, incrementStatement) \
419 			sumVariableName += _partialSum; \
420 		} \
421 		if (_n & 4) { \
422 			PAIRWISE_SUM_4_TERMS (AccumulatorType, _partialSum, termExpression, incrementStatement) \
423 			sumVariableName += _partialSum; \
424 		} \
425 		if (_n & 8) { \
426 			PAIRWISE_SUM_8_TERMS (AccumulatorType, _partialSum, termExpression, incrementStatement) \
427 			sumVariableName += _partialSum; \
428 		} \
429 		if (_n & 16) { \
430 			PAIRWISE_SUM_16_TERMS (AccumulatorType, _partialSum, termExpression, incrementStatement) \
431 			sumVariableName += _partialSum; \
432 		} \
433 		if (_n & 32) { \
434 			PAIRWISE_SUM_32_TERMS (AccumulatorType, _partialSum, termExpression, incrementStatement) \
435 			sumVariableName += _partialSum; \
436 		} \
437 		const int _baseCasePower = 6;   /* because the base case is 64 = 2^6 terms */ \
438 		CounterType _numberOfBaseCases = _n >> _baseCasePower; \
439 		if (_numberOfBaseCases != 0) { \
440 			/*                                                                                  */ \
441 			/*  The value of _powers [0] stays at 0, to denote the bottom of the stack.         */ \
442 			/*  The maximum value of _powers [1] should be 62, which denotes that 2^62 terms    */ \
443 			/*  have been accumulated into _partialSumStack [1]. This the maximum,              */ \
444 			/*  because _n can be at most 2^63-1 (assuming CounterType is 64 bits).             */ \
445 			/*  The maximum value of _powers [2] should then be 61.                             */ \
446 			/*  The maximum value of _powers [3] should be 60.                                  */ \
447 			/*  ...                                                                             */ \
448 			/*  The maximum value of _powers [57] should be 6. It ends there, because           */ \
449 			/*  2^6 is the granularity with which base case sums are put on the stack.          */ \
450 			/*  The maximum value of _powers [58] should also be 6,                             */ \
451 			/*  because this can be the situation just before collapsing the top of the stack.  */ \
452 			/*  However, if the whole stack is filled up like this, the actual number of        */ \
453 			/*  terms is already 2^63 (i.e. 2^62 + 2^61 + 2^60 + ... 2^6 + 2^6). Therefore, we  */ \
454 			/*  need one element less, so the highest index of _powers [] should be 57.         */ \
455 			/*  For 32-bit counters, this highest index is 25.                                  */ \
456 			/*                                                                                  */ \
457 			const int _numberOfBitsInCounterType = 8 * sizeof (CounterType);   /* 64 or 32 */ \
458 			const int _highestIndex = _numberOfBitsInCounterType - 1 - _baseCasePower; \
459 			AccumulatorType _partialSumStack [1 + _highestIndex];   /* 8 bytes too many, but better code */ \
460 			unsigned char _powers [1 + _highestIndex]; \
461 			_powers [0] = 0; \
462 			int _stackPointer = 0; \
463 			for (CounterType _ipart = 1; _ipart <= _numberOfBaseCases; _ipart ++) { \
464 				/*                                                                              */ \
465 				/*  Compute the sum of the next 64 data points.                                 */ \
466 				/*                                                                              */ \
467 				PAIRWISE_SUM_64_TERMS (AccumulatorType, _partialSum, termExpression, incrementStatement) \
468 				/*                                                                              */ \
469 				/*  Put this sum on top of the stack.                                           */ \
470 				/*                                                                              */ \
471 				_partialSumStack [++ _stackPointer] = _partialSum; \
472 				_powers [_stackPointer] = _baseCasePower; \
473 				/*                                                                              */ \
474 				/*  The collapse criterion:                                                     */ \
475 				/*                                                                              */ \
476 				while (_powers [_stackPointer] == _powers [_stackPointer - 1]) { \
477 					_partialSumStack [_stackPointer - 1] += _partialSumStack [_stackPointer]; \
478 					_powers [-- _stackPointer] += 1; \
479 				} \
480 			} \
481 			/*                                                                                  */ \
482 			/*  Add all the elements of the stack, starting at the top.                         */ \
483 			/*                                                                                  */ \
484 			while (_stackPointer > 0) { \
485 				sumVariableName += _partialSumStack [_stackPointer --]; \
486 			} \
487 		} \
488 	}
489 /*
490 	Note that we don't do the usual trick with `do {...} while (0)` that would allow you to add
491 	a semicolon after the `PAIRWISE_SUM()` call. This is to prevent the suggestion that the macro
492 	constitutes a single statement. The macro contains a sequence of two things: the definition
493 	of `sumVariableName` and a long block that usually changes the value of `sumVariableName`.
494 	Hence, the macro cannot be used as a single statement
495 	and e.g. has to be bracketed if used in an `else` clause.
496 	You are therefore advised to call `PAIRWISE_SUM()` without appending the misleading semicolon.
497 */
498 
499 /*
500 	## 12. SOME LESS GOOD SUMMATION ALGORITHMS
501 
502 	The summation algorithm (at least for computing the mean) in the statistics software R
503 	is two-loop summation. This is fairly precise, but very slow:
504 */
505 #define TWO_LOOP_SUM(AccumulatorType, sumVariableName, CounterType, sizeExpression, \
506 	initializeStatement, termExpression, incrementStatement) \
507 \
508 	AccumulatorType sumVariableName = 0.0; \
509 	{/* scope */ \
510 		CounterType _n = sizeExpression; \
511 		{/* scope */ \
512 			initializeStatement; \
513 			for (CounterType _i = 1; _i <= _n; _i ++) { \
514 				sumVariableName += termExpression; \
515 				incrementStatement; \
516 			} \
517 		} \
518 		AccumulatorType _mean = sumVariableName / _n; \
519 		{/* scope */ \
520 			sumVariableName = 0.0; \
521 			initializeStatement; \
522 			for (CounterType _i = 1; _i <= _n; _i ++) { \
523 				sumVariableName += (termExpression) - _mean; \
524 				incrementStatement; \
525 			} \
526 			sumVariableName += _mean * _n; \
527 		} \
528 	}
529 /*
530 	Another one is the Kahan algorithm. Its precision is similar to that of pairwise summation,
531 	but it is extremely slow:
532 */
533 #define KAHAN_SUM(AccumulatorType, sumVariableName, CounterType, sizeExpression, \
534 	initializeStatement, termExpression, incrementStatement) \
535 \
536 	AccumulatorType sumVariableName = 0.0; \
537 	{/* scope */ \
538 		initializeStatement; \
539 		CounterType _n = sizeExpression; \
540 		AccumulatorType _correction = 0.0; \
541 		for (CounterType _i = 1; _i <= _n; _i ++) { \
542 			AccumulatorType _correctedTerm = (termExpression) - _correction; \
543 			AccumulatorType _newSum = sumVariableName + _correctedTerm; \
544 			_correction = (_newSum - sumVariableName) - _correctedTerm; \
545 			sumVariableName = _newSum; \
546 			incrementStatement; \
547 		} \
548 	}
549 
550 /* End of file PAIRWISE_SUM.h */
551 #endif
552