1 #ifndef _PAIRWISE_SUM_h_ 2 #define _PAIRWISE_SUM_h_ 3 /* PAIRWISE_SUM.h 4 * 5 * Copyright (C) 2017-2020 Paul Boersma <paul.boersma@uva.nl> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions must retain the above copyright notice, 11 * this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 21 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 * 28 * (This is the two-clause BSD license, which is approximately identical to CC-BY.) 29 */ 30 31 /* 32 # PAIRWISE SUMMATION FOR C AND C++ 33 34 The pairwise summation algorithm as implemented here is approximately 1.5 times 35 faster than the usual naive sequential summation loop, 36 and is also hundreds of times more precise than sequential summation. 37 Using pairwise summation instead of sequential summation will improve all your 38 code for sums, means, inner products, norms, matrix multiplications, 39 and CPU-based neural networks. 40 41 This is not necessarily a small thing in terms of CO2. 42 43 The macro you will call is 44 45 PAIRWISE_SUM ( 46 AccumulatorType, // the type of the accumulator: long double, double, float; 47 sumVariableName, // the variable name of the accumulator, e.g. "sum"; 48 CounterType, // the type of the term counters: long, int, int64_t, intptr_t; 49 sizeExpression, // the total number of terms to sum; 50 initializeStatement, // the statement that declares and initializes the loop pointer(s), 51 // which is/are typically a pointer or pointers to the data; 52 termExpression // the expression that computes a "term" (thing to sum) 53 // from a piece or pieces of data. 54 incrementStatement, // the statement that increments the loop pointer(s) 55 // before the next piece(s) of data is/are fetched; 56 ) 57 58 We explain this in detail below, pedagogically starting with an analogous macro 59 for the more usual "sequential" summation algorithm. 60 61 ## 1. SEQUENTIAL SUMMATION: A SINGLE ACCUMULATOR 62 63 The simplest use of summation of multiple values is if you have a single array of 64 floating-point data to sum. If this array is x [1..size], 65 then the usual straightforward algorithm for adding the terms is 66 67 long double sum = 0.0; 68 for (long i = 1; i <= size; i ++) 69 sum += x [i]; 70 printf ("%.17g", (double) sum); 71 72 This is *sequential summation*: a single *accumulator*, namely the variable `sum`, 73 accumulates every element sequentially. Note that even though the array `x` will 74 typically be of type `double`, the accumulator is of type `long double` in this example: 75 using 80 bits for summing is much better than using 64 bits (1000 times more precise), 76 and on modern computers it is usually equally fast. 77 78 The formulation above, which uses array indexing, is equivalent to a formulation 79 in terms of a "looping pointer": 80 81 long double sum = 0.0; 82 const double *xx = & x [1]; // the looping pointer 83 for (long i = 1; i <= size; i ++) { 84 sum += *xx; 85 xx += 1; 86 } 87 printf ("%.17g", (double) sum); 88 89 The looping-pointer formulation consists of defining (declaring and initializing) 90 the pointer before the loop, and then inside the loop 91 first dereferencing the pointer and then incrementing it. 92 93 In this little algorithm we can discern the following seven ingredients: 94 - AccumulatorType: long double 95 - sumVariableName: "sum" 96 - CounterType: long 97 - sizeExpression: "size" 98 - initializeStatement: "const double *xx = x" 99 - termExpression: "*xx" 100 - incrementStatement: "xx += 1" 101 102 The algorithm can therefore be replaced with 103 104 SEQUENTIAL_SUM (long double, sum, long, size, const double *xx = & x [1], *xx, xx += 1) 105 printf ("%.17g", (double) sum); 106 107 where the SEQUENTIAL_SUM macro is defined as: 108 */ 109 #define SEQUENTIAL_SUM(AccumulatorType, sumVariableName, CounterType, sizeExpression, \ 110 initializeStatement, termExpression, incrementStatement) \ 111 \ 112 AccumulatorType sumVariableName = 0.0; \ 113 {/* scope */ \ 114 initializeStatement; \ 115 CounterType _n = sizeExpression; /* to ensure that the size expression is evaluated only once */ \ 116 for (CounterType _i = 1; _i <= _n; _i ++) { \ 117 sumVariableName += termExpression; \ 118 incrementStatement; \ 119 } \ 120 } 121 122 /* 123 ## 2. SEQUENTIAL SUMMATION IN MACHINE CODE 124 125 How would you sum the four numbers a, b, c and d, 126 if all that your computer can do is adding two numbers at a time? 127 128 You could write the sum in one stroke in C, without parentheses: 129 130 sum = a + b + c + d; 131 132 Evaluation of pluses in C proceeds from left to right, so this formulation means the same as 133 134 sum = ((a + b) + c) + d; 135 136 In machine-like C, using the "registers" r1, r2, r3 and r4, you could implement this as 137 138 r1 = a; 139 r2 = b; 140 r1 += r2; 141 r3 = c; 142 r1 += r3; 143 r4 = d; 144 r1 += r4; 145 sum = r1; 146 147 All these three formulations lead to identical true machine code, 148 at least with Clang or GCC, and with optimization option -O3. 149 150 151 ## 3. SPEED OF SEQUENTIAL SUMMATION 152 153 If your processor has good prefetching, then the four loads from RAM (moving a, b, c, 154 and d into the registers r1, r2, r3 and r4) will take up no time at all. 155 The three additions, however, will require three clock cycles in total: 156 as all additions are performed on r1, each addition will have to wait 157 until the previous addition has finished. Adding up 64 numbers will require 63 cycles. 158 159 160 ## 4. PRECISION OF SEQUENTIAL SUMMATION 161 162 The worst-case floating-point rounding error for sequential summation is N * epsilon, 163 where N is the number of numbers to add, and epsilon is the precision of one number. 164 The expected random-walk rounding error is epsilon * sqrt (N). 165 166 167 ## 5. PAIRWISE SUMMATION: MULTIPLE ACCUMULATORS 168 169 An alternative way to add the four numbers, next to sequential summation, 170 is to proceed in a *pairwise* manner, dividing the four numbers into two groups, 171 then summing these separately, then summing the two results. In short C this would be: 172 173 sum = (a + b) + (c + d); 174 175 This pairwise summation continues recursively. For instance, you add 8 numbers this way: 176 177 sum = ((a + b) + (c + d)) + ((e + f) + (g + h)); 178 179 180 ## 6. PAIRWISE SUMMATION IN MACHINE CODE 181 182 In machine-like C, using the "registers" r1, r2, r3 and r4, you can implement this as 183 184 r1 = a; 185 r2 = b; 186 r1 += r2; 187 r3 = c; 188 r4 = d; 189 r3 += r4; 190 r1 += r3; 191 sum = r1; 192 193 The number of loads, stores and additions is identical to those in the sequential case. 194 195 196 ## 7. SPEED OF PAIRWISE SUMMATION 197 198 Pairwise summation can be faster than sequential summation, 199 because the addition of r2 to r1 can be performed in parallel to the addition of r4 to r3. 200 The three additions in section 5 could therefore be performed in two clock cycles. 201 Indeed, both compilers that I am using in 2017 (Clang on the Mac, and GCC on Windows and Linux) 202 take only 0.30 nanoseconds per addition, which is just over two-thirds of the clock period 203 of the 2.3 GHz processor (which is 0.435 nanoseconds) of my late 2013 Macbook Pro. 204 On a processor far, far away, which has more than 64 registers, perfect prefetching, 205 and perfectly parallel operations on independent registers, 64 terms may be added 206 within 6 clock cycles: six is the number of times we need to add something to r1, 207 and each of these additions has to wait for the result of the previous addition. 208 Theoretically, then, but not yet in practice, the execution time has fallen from order N 209 to order log(N), at least for low N (and for high N it will be of order N, 210 but with a much lower multiplication factor than sequential summation has). 211 212 213 ## 8. PRECISION OF PAIRWISE SUMMATION 214 215 The worst-case rounding error is only epsilon * log (N), 216 and the expected random-walk rounding error is only epsilon * sqrt (log (N)). 217 Both are much better than in the sequential summation case. 218 219 220 ## 9. HOW TO USE PAIRWISE SUMMATION 221 222 For pairwise summation we use the exact same macro arguments as for sequential summation: 223 224 PAIRWISE_SUM (long double, sum, long, size, const double *xx = & x [1], *xx, xx += 1) 225 printf ("%.17g", (double) sum); 226 227 The macro starts by declaring a variable of type `AccumulatorType` and name `sumVariableName`. 228 Thus, the expansion of the macro effectively starts with the declaration `long double sum;` 229 and the variable `sum` will be available after the macro ends, 230 so that you can use it in your subsequent code, as is done here by using `(double) sum`. 231 This variable is used to accumulate the resulting sum and should for best results 232 therefore be capable of accumulating partial sums with little rounding error. 233 For this reason, you will usually want AccumulatorType to be `long double`. 234 If long doubles are not available on your platform, or if they are slow on your platform, 235 you can use `double` instead, or even `float`. 236 237 The size of the array to sum comes in in the argument `sizeExpression`, and in the expansion 238 of the macro this number and all macro-internal counters are of type `CounterType`. 239 In the example above I used `long` for simplicity, but you should note that on 64-bit Windows 240 a `long` is only 32 bits, whereas the natural type for array indexing on a 64-bit machine 241 would be `int64_t`; I therefore tend to use a CounterType of `intptr_t` myself, 242 because this is 32 bits on 32-bit machines and 64 bits on 64-bit machines. 243 If your `size` is an `int` (always 32 bits nowadays), you would use a CounterType of `int`; 244 it is not beneficial to use a wider CounterType than the width of your `size`. 245 246 The fifth argument of the macro declares and initializes the loop pointer, 247 as in `const double *xx = & x [1]` above. This pointer starts out pointing at the first 248 element of the array. 249 250 The sixth argument of the macro is an expression that should evaluate to the value of a term, 251 given the current value of the loop pointer(s), as in `*xx` above. 252 253 The seventh argument is the formula you use for incrementing the loop pointer(s), 254 as in `xx += 1` above. The macro uses this formula to prepare for the retrieval of 255 the next term in the summation. 256 257 More complicated use cases than a single array also exist. For instance, if you have to compute 258 the inner product of the two arrays x [1..n] and y [1..n], you can do 259 260 PAIRWISE_SUM (long double, inner, long, n, 261 const double *xx = & x [1]; // the semicolon ensures that this line and the next form a single argument 262 const double *yy = & y [1], 263 (long double) *xx * (long double) *yy, 264 (xx += 1, yy += 1) 265 ) 266 printf ("%.17g", (double) inner); 267 268 Note for the sixth argument: as 64-bit multiplication loses a lot of the precision of its 269 two 64-bit operands, it is advisable to convert both operands to `long double` 270 *before* the multiplication, as is done here. This usually costs no extra computation 271 time (it can actually be faster). If you do 272 273 PAIRWISE_SUM (long double, inner, long, n, 274 const double *xx = & x [1]; const double *yy = & y [1], *xx * *yy, (++ xx, ++ yy)) 275 printf ("%.17g", (double) inner); 276 277 instead, the conversion to `long double` is done (by the macro) *after* the multiplication, 278 which is less precise. 279 280 Note for the seventh argument: you can see here that you can do the two increments simultaneously 281 by using parentheses and a comma; fortunately, the C macro preprocessor understands enough 282 about parentheses to see that you mean the seventh argument to be a single argument. 283 284 If you find the tricks with the semicolon and the parentheses too arcane, you can also just do 285 286 PAIRWISE_SUM (long double, inner, long, n, 287 long i = 1, 288 (long double) x [i] * (long double) y [i], 289 i ++ 290 ) 291 printf ("%.17g", (double) inner); 292 293 and hope (i.e. test) that the compiler creates equally fast code. This method with its 294 additional loop counter will usually be needed anyway when you want to *nest* pairwise sums. 295 296 Other use cases include array multiplication with strides... 297 298 PAIRWISE_SUM (long double, inner, long, n, 299 const double *xx = & x [1]; // note the funny semicolon again 300 const double *yy = & y [1], 301 (long double) *xx * (long double) *yy, 302 (xx += xstride, yy += ystride) 303 ) 304 printf ("%.17g", (double) inner); 305 306 ... and small-lag convolution... 307 308 for (long i = 1; i <= xSize - kernelSize + 1; i ++) { 309 PAIRWISE_SUM (long double, conv, long, kernelSize, 310 const double *xx = & x [i]; 311 const double *filter = & kernel [kernelSize], 312 (long double) *xx * (long double) *filter, 313 (xx += 1, filter -= 1) 314 ) 315 result [i] = (double) conv; 316 } 317 318 ... and matrix multiplication, and computing norms. 319 320 321 ## 10. IMPLEMENTATION OF PAIRWISE SUMMATION: LOW POWERS OF 2 322 323 We have to implement pairwise summation with a macro, 324 because the sixth and seventh arguments to PAIRWISE_SUM() have to be formulas for getting 325 the next element to add. We use fixed formulas for adding 2, 4, 8, 16 or 32 terms, 326 and a stack for the next 57 higher powers of 2, up to 2^62, 327 so that our summation will work for N up to 2^63 - 1. 328 The fixed formulas for the low powers of 2 are recursively defined macros: 329 */ 330 331 #define PAIRWISE_SUM_1_TERM(AccumulatorType, accumulator, termExpression, incrementStatement) \ 332 AccumulatorType accumulator = termExpression; \ 333 incrementStatement; 334 335 #define PAIRWISE_SUM_2_TERMS(AccumulatorType, accumulator, termExpression, incrementStatement) \ 336 PAIRWISE_SUM_1_TERM (AccumulatorType, accumulator, termExpression, incrementStatement) \ 337 { \ 338 PAIRWISE_SUM_1_TERM (AccumulatorType, _r2, termExpression, incrementStatement) \ 339 accumulator += _r2; \ 340 } 341 342 #define PAIRWISE_SUM_4_TERMS(AccumulatorType, accumulator, termExpression, incrementStatement) \ 343 PAIRWISE_SUM_2_TERMS (AccumulatorType, accumulator, termExpression, incrementStatement) \ 344 { \ 345 PAIRWISE_SUM_2_TERMS (AccumulatorType, _r3, termExpression, incrementStatement) \ 346 accumulator += _r3; \ 347 } 348 349 #define PAIRWISE_SUM_8_TERMS(AccumulatorType, accumulator, termExpression, incrementStatement) \ 350 PAIRWISE_SUM_4_TERMS (AccumulatorType, accumulator, termExpression, incrementStatement) \ 351 { \ 352 PAIRWISE_SUM_4_TERMS (AccumulatorType, _r4, termExpression, incrementStatement) \ 353 accumulator += _r4; \ 354 } 355 356 #define PAIRWISE_SUM_16_TERMS(AccumulatorType, accumulator, termExpression, incrementStatement) \ 357 PAIRWISE_SUM_8_TERMS (AccumulatorType, accumulator, termExpression, incrementStatement) \ 358 { \ 359 PAIRWISE_SUM_8_TERMS (AccumulatorType, _r5, termExpression, incrementStatement) \ 360 accumulator += _r5; \ 361 } 362 363 #define PAIRWISE_SUM_32_TERMS(AccumulatorType, accumulator, termExpression, incrementStatement) \ 364 PAIRWISE_SUM_16_TERMS (AccumulatorType, accumulator, termExpression, incrementStatement) \ 365 { \ 366 PAIRWISE_SUM_16_TERMS (AccumulatorType, _r6, termExpression, incrementStatement) \ 367 accumulator += _r6; \ 368 } 369 /* 370 (The difference between the variable names "_r2", "_r3" and so on is not strictly needed, 371 but making them the same would lead the compiler to issue warnings about shadowed variables.) 372 */ 373 374 /* 375 ## 11. IMPLEMENTATION OF PAIRWISE ADDITION: HIGH POWERS OF 2 376 377 Higher powers of 2 than 2^5 (= 32) go on a stack. We sum 64 values at each stroke, 378 and this requires a fixed formula for these 64 terms: 379 */ 380 #define PAIRWISE_SUM_64_TERMS(AccumulatorType, accumulator, termExpression, incrementStatement) \ 381 PAIRWISE_SUM_32_TERMS (AccumulatorType, accumulator, termExpression, incrementStatement) \ 382 { \ 383 PAIRWISE_SUM_32_TERMS (AccumulatorType, _r7, termExpression, incrementStatement) \ 384 accumulator += _r7; \ 385 } 386 #define PAIRWISE_SUM_128_TERMS(AccumulatorType, accumulator, termExpression, incrementStatement) \ 387 PAIRWISE_SUM_64_TERMS (AccumulatorType, accumulator, termExpression, incrementStatement) \ 388 { \ 389 PAIRWISE_SUM_64_TERMS (AccumulatorType, _r8, termExpression, incrementStatement) \ 390 accumulator += _r8; \ 391 } 392 #define PAIRWISE_SUM_256_TERMS(AccumulatorType, accumulator, termExpression, incrementStatement) \ 393 PAIRWISE_SUM_128_TERMS (AccumulatorType, accumulator, termExpression, incrementStatement) \ 394 { \ 395 PAIRWISE_SUM_128_TERMS (AccumulatorType, _r9, termExpression, incrementStatement) \ 396 accumulator += _r9; \ 397 } 398 399 /* 400 A generalization about the timing of the summations in all the above macros 401 is that r(i+1) is added to r(i) precisely when r(i+1) contains the same 402 number of terms as r(i). This criterion for collapsing the partial sums 403 is also used in the stack logic below. 404 */ 405 406 #define PAIRWISE_SUM(AccumulatorType, sumVariableName, CounterType, sizeExpression, \ 407 initializeStatement, termExpression, incrementStatement) \ 408 \ 409 AccumulatorType sumVariableName = 0.0; \ 410 {/* scope */ \ 411 initializeStatement; \ 412 CounterType _n = sizeExpression; \ 413 if (_n & 1) { \ 414 PAIRWISE_SUM_1_TERM (AccumulatorType, _partialSum, termExpression, incrementStatement) \ 415 sumVariableName += _partialSum; \ 416 } \ 417 if (_n & 2) { \ 418 PAIRWISE_SUM_2_TERMS (AccumulatorType, _partialSum, termExpression, incrementStatement) \ 419 sumVariableName += _partialSum; \ 420 } \ 421 if (_n & 4) { \ 422 PAIRWISE_SUM_4_TERMS (AccumulatorType, _partialSum, termExpression, incrementStatement) \ 423 sumVariableName += _partialSum; \ 424 } \ 425 if (_n & 8) { \ 426 PAIRWISE_SUM_8_TERMS (AccumulatorType, _partialSum, termExpression, incrementStatement) \ 427 sumVariableName += _partialSum; \ 428 } \ 429 if (_n & 16) { \ 430 PAIRWISE_SUM_16_TERMS (AccumulatorType, _partialSum, termExpression, incrementStatement) \ 431 sumVariableName += _partialSum; \ 432 } \ 433 if (_n & 32) { \ 434 PAIRWISE_SUM_32_TERMS (AccumulatorType, _partialSum, termExpression, incrementStatement) \ 435 sumVariableName += _partialSum; \ 436 } \ 437 const int _baseCasePower = 6; /* because the base case is 64 = 2^6 terms */ \ 438 CounterType _numberOfBaseCases = _n >> _baseCasePower; \ 439 if (_numberOfBaseCases != 0) { \ 440 /* */ \ 441 /* The value of _powers [0] stays at 0, to denote the bottom of the stack. */ \ 442 /* The maximum value of _powers [1] should be 62, which denotes that 2^62 terms */ \ 443 /* have been accumulated into _partialSumStack [1]. This the maximum, */ \ 444 /* because _n can be at most 2^63-1 (assuming CounterType is 64 bits). */ \ 445 /* The maximum value of _powers [2] should then be 61. */ \ 446 /* The maximum value of _powers [3] should be 60. */ \ 447 /* ... */ \ 448 /* The maximum value of _powers [57] should be 6. It ends there, because */ \ 449 /* 2^6 is the granularity with which base case sums are put on the stack. */ \ 450 /* The maximum value of _powers [58] should also be 6, */ \ 451 /* because this can be the situation just before collapsing the top of the stack. */ \ 452 /* However, if the whole stack is filled up like this, the actual number of */ \ 453 /* terms is already 2^63 (i.e. 2^62 + 2^61 + 2^60 + ... 2^6 + 2^6). Therefore, we */ \ 454 /* need one element less, so the highest index of _powers [] should be 57. */ \ 455 /* For 32-bit counters, this highest index is 25. */ \ 456 /* */ \ 457 const int _numberOfBitsInCounterType = 8 * sizeof (CounterType); /* 64 or 32 */ \ 458 const int _highestIndex = _numberOfBitsInCounterType - 1 - _baseCasePower; \ 459 AccumulatorType _partialSumStack [1 + _highestIndex]; /* 8 bytes too many, but better code */ \ 460 unsigned char _powers [1 + _highestIndex]; \ 461 _powers [0] = 0; \ 462 int _stackPointer = 0; \ 463 for (CounterType _ipart = 1; _ipart <= _numberOfBaseCases; _ipart ++) { \ 464 /* */ \ 465 /* Compute the sum of the next 64 data points. */ \ 466 /* */ \ 467 PAIRWISE_SUM_64_TERMS (AccumulatorType, _partialSum, termExpression, incrementStatement) \ 468 /* */ \ 469 /* Put this sum on top of the stack. */ \ 470 /* */ \ 471 _partialSumStack [++ _stackPointer] = _partialSum; \ 472 _powers [_stackPointer] = _baseCasePower; \ 473 /* */ \ 474 /* The collapse criterion: */ \ 475 /* */ \ 476 while (_powers [_stackPointer] == _powers [_stackPointer - 1]) { \ 477 _partialSumStack [_stackPointer - 1] += _partialSumStack [_stackPointer]; \ 478 _powers [-- _stackPointer] += 1; \ 479 } \ 480 } \ 481 /* */ \ 482 /* Add all the elements of the stack, starting at the top. */ \ 483 /* */ \ 484 while (_stackPointer > 0) { \ 485 sumVariableName += _partialSumStack [_stackPointer --]; \ 486 } \ 487 } \ 488 } 489 /* 490 Note that we don't do the usual trick with `do {...} while (0)` that would allow you to add 491 a semicolon after the `PAIRWISE_SUM()` call. This is to prevent the suggestion that the macro 492 constitutes a single statement. The macro contains a sequence of two things: the definition 493 of `sumVariableName` and a long block that usually changes the value of `sumVariableName`. 494 Hence, the macro cannot be used as a single statement 495 and e.g. has to be bracketed if used in an `else` clause. 496 You are therefore advised to call `PAIRWISE_SUM()` without appending the misleading semicolon. 497 */ 498 499 /* 500 ## 12. SOME LESS GOOD SUMMATION ALGORITHMS 501 502 The summation algorithm (at least for computing the mean) in the statistics software R 503 is two-loop summation. This is fairly precise, but very slow: 504 */ 505 #define TWO_LOOP_SUM(AccumulatorType, sumVariableName, CounterType, sizeExpression, \ 506 initializeStatement, termExpression, incrementStatement) \ 507 \ 508 AccumulatorType sumVariableName = 0.0; \ 509 {/* scope */ \ 510 CounterType _n = sizeExpression; \ 511 {/* scope */ \ 512 initializeStatement; \ 513 for (CounterType _i = 1; _i <= _n; _i ++) { \ 514 sumVariableName += termExpression; \ 515 incrementStatement; \ 516 } \ 517 } \ 518 AccumulatorType _mean = sumVariableName / _n; \ 519 {/* scope */ \ 520 sumVariableName = 0.0; \ 521 initializeStatement; \ 522 for (CounterType _i = 1; _i <= _n; _i ++) { \ 523 sumVariableName += (termExpression) - _mean; \ 524 incrementStatement; \ 525 } \ 526 sumVariableName += _mean * _n; \ 527 } \ 528 } 529 /* 530 Another one is the Kahan algorithm. Its precision is similar to that of pairwise summation, 531 but it is extremely slow: 532 */ 533 #define KAHAN_SUM(AccumulatorType, sumVariableName, CounterType, sizeExpression, \ 534 initializeStatement, termExpression, incrementStatement) \ 535 \ 536 AccumulatorType sumVariableName = 0.0; \ 537 {/* scope */ \ 538 initializeStatement; \ 539 CounterType _n = sizeExpression; \ 540 AccumulatorType _correction = 0.0; \ 541 for (CounterType _i = 1; _i <= _n; _i ++) { \ 542 AccumulatorType _correctedTerm = (termExpression) - _correction; \ 543 AccumulatorType _newSum = sumVariableName + _correctedTerm; \ 544 _correction = (_newSum - sumVariableName) - _correctedTerm; \ 545 sumVariableName = _newSum; \ 546 incrementStatement; \ 547 } \ 548 } 549 550 /* End of file PAIRWISE_SUM.h */ 551 #endif 552