1 /* PSPP - a program for statistical analysis.
2    Copyright (C) 1997-9, 2000, 2010, 2011 Free Software Foundation, Inc.
3 
4    This program is free software: you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation, either version 3 of the License, or
7    (at your option) any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
16 
17 #include <config.h>
18 
19 #include "math/moments.h"
20 
21 #include <assert.h>
22 #include <math.h>
23 #include <stdlib.h>
24 
25 #include "data/val-type.h"
26 #include "data/value.h"
27 #include "libpspp/misc.h"
28 
29 #include "gl/xalloc.h"
30 
31 #include "gettext.h"
32 #define _(msgid) gettext (msgid)
33 
34 /* Calculates variance, skewness, and kurtosis into *VARIANCE,
35    *SKEWNESS, and *KURTOSIS if they are non-null and not greater
36    moments than MAX_MOMENT.  Accepts W as the total weight, D1 as
37    the total deviation from the estimated mean, and D2, D3, and
38    D4 as the sum of the squares, cubes, and 4th powers,
39    respectively, of the deviation from the estimated mean. */
40 static void
calc_moments(enum moment max_moment,double w,double d1,double d2,double d3,double d4,double * variance,double * skewness,double * kurtosis)41 calc_moments (enum moment max_moment,
42               double w, double d1, double d2, double d3, double d4,
43               double *variance, double *skewness, double *kurtosis)
44 {
45   assert (w > 0.);
46 
47   if (max_moment >= MOMENT_VARIANCE && w > 1.)
48     {
49       double s2 = (d2 - pow2 (d1) / w) / (w - 1.);
50       if (variance != NULL)
51         *variance = s2;
52 
53       /* From _SPSS Statistical Algorithms, 2nd ed.,
54          0-918469-89-9, section "DESCRIPTIVES". */
55       if (fabs (s2) >= 1e-20)
56         {
57           if (max_moment >= MOMENT_SKEWNESS && skewness != NULL && w > 2.)
58             {
59               double s3 = s2 * sqrt (s2);
60               double g1 = (w * d3) / ((w - 1.0) * (w - 2.0) * s3);
61               if (isfinite (g1))
62                 *skewness = g1;
63             }
64           if (max_moment >= MOMENT_KURTOSIS && kurtosis != NULL && w > 3.)
65             {
66               double den = (w - 2.) * (w - 3.) * pow2 (s2);
67               double g2 = (w * (w + 1) * d4 / (w - 1.) / den
68                            - 3. * pow2 (d2) / den);
69               if (isfinite (g2))
70                 *kurtosis = g2;
71             }
72         }
73     }
74 }
75 
76 /* Two-pass moments. */
77 
78 /* A set of two-pass moments. */
79 struct moments
80   {
81     enum moment max_moment;     /* Highest-order moment we're computing. */
82     int pass;                   /* Current pass (1 or 2). */
83 
84     /* Pass one. */
85     double w1;                  /* Total weight for pass 1, so far. */
86     double sum;                 /* Sum of values so far. */
87     double mean;                /* Mean = sum / w1. */
88 
89     /* Pass two. */
90     double w2;                  /* Total weight for pass 2, so far. */
91     double d1;                  /* Sum of deviations from the mean. */
92     double d2;                  /* Sum of squared deviations from the mean. */
93     double d3;                  /* Sum of cubed deviations from the mean. */
94     double d4;                  /* Sum of (deviations from the mean)**4. */
95   };
96 
97 /* Initializes moments M for calculating moment MAX_MOMENT and
98    lower moments. */
99 static void
init_moments(struct moments * m,enum moment max_moment)100 init_moments (struct moments *m, enum moment max_moment)
101 {
102   assert (m != NULL);
103   assert (max_moment == MOMENT_MEAN || max_moment == MOMENT_VARIANCE
104           || max_moment == MOMENT_SKEWNESS || max_moment == MOMENT_KURTOSIS);
105   m->max_moment = max_moment;
106   moments_clear (m);
107 }
108 
109 /* Clears out a set of moments so that it can be reused for a new
110    set of values.  The moments to be calculated are not changed. */
111 void
moments_clear(struct moments * m)112 moments_clear (struct moments *m)
113 {
114   m->pass = 1;
115   m->w1 = m->w2 = 0.;
116   m->sum = 0.;
117 }
118 
119 /* Creates and returns a data structure for calculating moment
120    MAX_MOMENT and lower moments on a data series.  The user
121    should call moments_pass_one() for each value in the series,
122    then call moments_pass_two() for the same set of values in the
123    same order, then call moments_calculate() to obtain the
124    moments.  The user may ask for the mean at any time during the
125    first pass (using moments_calculate()), but otherwise no
126    statistics may be requested until the end of the second pass.
127    Call moments_destroy() when the moments are no longer
128    needed. */
129 struct moments *
moments_create(enum moment max_moment)130 moments_create (enum moment max_moment)
131 {
132   struct moments *m = xmalloc (sizeof *m);
133   init_moments (m, max_moment);
134   return m;
135 }
136 
137 /* Adds VALUE with the given WEIGHT to the calculation of
138    moments for the first pass. */
139 void
moments_pass_one(struct moments * m,double value,double weight)140 moments_pass_one (struct moments *m, double value, double weight)
141 {
142   assert (m != NULL);
143   assert (m->pass == 1);
144 
145   if (value != SYSMIS && weight > 0.)
146     {
147       m->sum += value * weight;
148       m->w1 += weight;
149     }
150 }
151 
152 /* Adds VALUE with the given WEIGHT to the calculation of
153    moments for the second pass. */
154 void
moments_pass_two(struct moments * m,double value,double weight)155 moments_pass_two (struct moments *m, double value, double weight)
156 {
157   assert (m != NULL);
158 
159   if (m->pass == 1)
160     {
161       m->pass = 2;
162       m->mean = m->w1 != 0. ? m->sum / m->w1 : 0.;
163       m->d1 = m->d2 = m->d3 = m->d4 = 0.;
164     }
165 
166   if (value != SYSMIS && weight >= 0.)
167     {
168       double d = value - m->mean;
169       double d1_delta = d * weight;
170       m->d1 += d1_delta;
171       if (m->max_moment >= MOMENT_VARIANCE)
172         {
173           double d2_delta = d1_delta * d;
174           m->d2 += d2_delta;
175           if (m->max_moment >= MOMENT_SKEWNESS)
176             {
177               double d3_delta = d2_delta * d;
178               m->d3 += d3_delta;
179               if (m->max_moment >= MOMENT_KURTOSIS)
180                 {
181                   double d4_delta = d3_delta * d;
182                   m->d4 += d4_delta;
183                 }
184             }
185         }
186       m->w2 += weight;
187     }
188 }
189 
190 /* Calculates moments based on the input data.  Stores the total
191    weight in *WEIGHT, the mean in *MEAN, the variance in
192    *VARIANCE, the skewness in *SKEWNESS, and the kurtosis in
193    *KURTOSIS.  Any of these result parameters may be null
194    pointers, in which case the values are not calculated.  If any
195    result cannot be calculated, either because they are undefined
196    based on the input data or because their moments are higher
197    than the maximum requested on moments_create(), then SYSMIS is
198    stored into that result. */
199 void
moments_calculate(const struct moments * m,double * weight,double * mean,double * variance,double * skewness,double * kurtosis)200 moments_calculate (const struct moments *m,
201                    double *weight,
202                    double *mean, double *variance,
203                    double *skewness, double *kurtosis)
204 {
205   assert (m != NULL);
206 
207   if (mean != NULL)
208     *mean = SYSMIS;
209   if (variance != NULL)
210     *variance = SYSMIS;
211   if (skewness != NULL)
212     *skewness = SYSMIS;
213   if (kurtosis != NULL)
214     *kurtosis = SYSMIS;
215 
216   if (weight != NULL)
217     *weight = m->w1;
218 
219   /* How many passes so far? */
220   if (m->pass == 1)
221     {
222       /* In the first pass we can only calculate the mean. */
223       if (mean != NULL && m->w1 > 0.)
224         *mean = m->sum / m->w1;
225     }
226   else
227     {
228       /* After the second pass we can calculate any stat.  */
229       assert (m->pass == 2);
230 
231       if (m->w2 > 0.)
232         {
233           if (mean != NULL)
234             *mean = m->mean;
235           calc_moments (m->max_moment,
236                         m->w2, m->d1, m->d2, m->d3, m->d4,
237                         variance, skewness, kurtosis);
238         }
239     }
240 }
241 
242 /* Destroys a set of moments. */
243 void
moments_destroy(struct moments * m)244 moments_destroy (struct moments *m)
245 {
246   free (m);
247 }
248 
249 /* Calculates the requested moments on the CNT values in ARRAY.
250    Each value is given a weight of 1.  The total weight is stored
251    into *WEIGHT (trivially) and the mean, variance, skewness, and
252    kurtosis are stored into *MEAN, *VARIANCE, *SKEWNESS, and
253    *KURTOSIS, respectively.  Any of the result pointers may be
254    null, in which case no value is stored. */
255 void
moments_of_doubles(const double * array,size_t cnt,double * weight,double * mean,double * variance,double * skewness,double * kurtosis)256 moments_of_doubles (const double *array, size_t cnt,
257                     double *weight,
258                     double *mean, double *variance,
259                     double *skewness, double *kurtosis)
260 {
261   enum moment max_moment;
262   struct moments m;
263   size_t idx;
264 
265   if (kurtosis != NULL)
266     max_moment = MOMENT_KURTOSIS;
267   else if (skewness != NULL)
268     max_moment = MOMENT_SKEWNESS;
269   else if (variance != NULL)
270     max_moment = MOMENT_VARIANCE;
271   else
272     max_moment = MOMENT_MEAN;
273 
274   init_moments (&m, max_moment);
275   for (idx = 0; idx < cnt; idx++)
276     moments_pass_one (&m, array[idx], 1.);
277   for (idx = 0; idx < cnt; idx++)
278     moments_pass_two (&m, array[idx], 1.);
279   moments_calculate (&m, weight, mean, variance, skewness, kurtosis);
280 }
281 
282 /* Calculates the requested moments on the CNT numeric values in
283    ARRAY.  Each value is given a weight of 1.  The total weight
284    is stored into *WEIGHT (trivially) and the mean, variance,
285    skewness, and kurtosis are stored into *MEAN, *VARIANCE,
286    *SKEWNESS, and *KURTOSIS, respectively.  Any of the result
287    pointers may be null, in which case no value is stored. */
288 void
moments_of_values(const union value * array,size_t cnt,double * weight,double * mean,double * variance,double * skewness,double * kurtosis)289 moments_of_values (const union value *array, size_t cnt,
290                    double *weight,
291                    double *mean, double *variance,
292                    double *skewness, double *kurtosis)
293 {
294   enum moment max_moment;
295   struct moments m;
296   size_t idx;
297 
298   if (kurtosis != NULL)
299     max_moment = MOMENT_KURTOSIS;
300   else if (skewness != NULL)
301     max_moment = MOMENT_SKEWNESS;
302   else if (variance != NULL)
303     max_moment = MOMENT_VARIANCE;
304   else
305     max_moment = MOMENT_MEAN;
306 
307   init_moments (&m, max_moment);
308   for (idx = 0; idx < cnt; idx++)
309     moments_pass_one (&m, array[idx].f, 1.);
310   for (idx = 0; idx < cnt; idx++)
311     moments_pass_two (&m, array[idx].f, 1.);
312   moments_calculate (&m, weight, mean, variance, skewness, kurtosis);
313 }
314 
315 /* One-pass moments. */
316 
317 /* A set of one-pass moments. */
318 struct moments1
319   {
320     enum moment max_moment;     /* Highest-order moment we're computing. */
321     double w;                   /* Total weight so far. */
322     double d1;                  /* Sum of deviations from the mean. */
323     double d2;                  /* Sum of squared deviations from the mean. */
324     double d3;                  /* Sum of cubed deviations from the mean. */
325     double d4;                  /* Sum of (deviations from the mean)**4. */
326   };
327 
328 /* Initializes one-pass moments M for calculating moment
329    MAX_MOMENT and lower moments. */
330 static void
init_moments1(struct moments1 * m,enum moment max_moment)331 init_moments1 (struct moments1 *m, enum moment max_moment)
332 {
333   assert (m != NULL);
334   assert (max_moment == MOMENT_MEAN || max_moment == MOMENT_VARIANCE
335           || max_moment == MOMENT_SKEWNESS || max_moment == MOMENT_KURTOSIS);
336   m->max_moment = max_moment;
337   moments1_clear (m);
338 }
339 
340 /* Clears out a set of one-pass moments so that it can be reused
341    for a new set of values.  The moments to be calculated are not
342    changed. */
343 void
moments1_clear(struct moments1 * m)344 moments1_clear (struct moments1 *m)
345 {
346   m->w = 0.;
347   m->d1 = m->d2 = m->d3 = m->d4 = 0.;
348 }
349 
350 /* Creates and returns a data structure for calculating moment
351    MAX_MOMENT and lower moments on a data series in a single
352    pass.  The user should call moments1_add() for each value in
353    the series.  The user may call moments1_calculate() to obtain
354    the current moments at any time.  Call moments1_destroy() when
355    the moments are no longer needed.
356 
357    One-pass moments should only be used when two passes over the
358    data are impractical. */
359 struct moments1 *
moments1_create(enum moment max_moment)360 moments1_create (enum moment max_moment)
361 {
362   struct moments1 *m = xmalloc (sizeof *m);
363   init_moments1 (m, max_moment);
364   return m;
365 }
366 
367 /* Adds VALUE with the given WEIGHT to the calculation of
368    one-pass moments. */
369 void
moments1_add(struct moments1 * m,double value,double weight)370 moments1_add (struct moments1 *m, double value, double weight)
371 {
372   assert (m != NULL);
373 
374   if (value != SYSMIS && weight > 0.)
375     {
376       double prev_w, v1;
377 
378       prev_w = m->w;
379       m->w += weight;
380       v1 = (weight / m->w) * (value - m->d1);
381       m->d1 += v1;
382 
383       if (m->max_moment >= MOMENT_VARIANCE)
384         {
385           double v2 = v1 * v1;
386           double w_prev_w = m->w * prev_w;
387           double prev_m2 = m->d2;
388 
389           m->d2 += w_prev_w / weight * v2;
390           if (m->max_moment >= MOMENT_SKEWNESS)
391             {
392               double w2 = weight * weight;
393               double v3 = v2 * v1;
394               double prev_m3 = m->d3;
395 
396               m->d3 += (-3. * v1 * prev_m2
397                          + w_prev_w / w2 * (m->w - 2. * weight) * v3);
398               if (m->max_moment >= MOMENT_KURTOSIS)
399                 {
400                   double w3 = w2 * weight;
401                   double v4 = v2 * v2;
402 
403                   m->d4 += (-4. * v1 * prev_m3
404                              + 6. * v2 * prev_m2
405                              + ((pow2 (m->w) - 3. * weight * prev_w)
406                                 * v4 * w_prev_w / w3));
407                 }
408             }
409         }
410     }
411 }
412 
413 /* Calculates one-pass moments based on the input data.  Stores
414    the total weight in *WEIGHT, the mean in *MEAN, the variance
415    in *VARIANCE, the skewness in *SKEWNESS, and the kurtosis in
416    *KURTOSIS.  Any of these result parameters may be null
417    pointers, in which case the values are not calculated.  If any
418    result cannot be calculated, either because they are undefined
419    based on the input data or because their moments are higher
420    than the maximum requested on moments_create(), then SYSMIS is
421    stored into that result. */
422 void
moments1_calculate(const struct moments1 * m,double * weight,double * mean,double * variance,double * skewness,double * kurtosis)423 moments1_calculate (const struct moments1 *m,
424                     double *weight,
425                     double *mean, double *variance,
426                     double *skewness, double *kurtosis)
427 {
428   assert (m != NULL);
429 
430   if (mean != NULL)
431     *mean = SYSMIS;
432   if (variance != NULL)
433     *variance = SYSMIS;
434   if (skewness != NULL)
435     *skewness = SYSMIS;
436   if (kurtosis != NULL)
437     *kurtosis = SYSMIS;
438 
439   if (weight != NULL)
440     *weight = m->w;
441 
442   if (m->w > 0.)
443     {
444       if (mean != NULL)
445         *mean = m->d1;
446 
447       calc_moments (m->max_moment,
448                     m->w, 0., m->d2, m->d3, m->d4,
449                     variance, skewness, kurtosis);
450     }
451 }
452 
453 /* Destroy one-pass moments M. */
454 void
moments1_destroy(struct moments1 * m)455 moments1_destroy (struct moments1 *m)
456 {
457   free (m);
458 }
459 
460 
461 double
calc_semean(double var,double W)462 calc_semean (double var, double W)
463 {
464   return sqrt (var / W);
465 }
466 
467 /* Returns the standard error of the skewness for the given total
468    weight W.
469 
470    From _SPSS Statistical Algorithms, 2nd ed., 0-918469-89-9,
471    section "DESCRIPTIVES". */
472 double
calc_seskew(double W)473 calc_seskew (double W)
474 {
475   return sqrt ((6. * W * (W - 1.)) / ((W - 2.) * (W + 1.) * (W + 3.)));
476 }
477 
478 /* Returns the standard error of the kurtosis for the given total
479    weight W.
480 
481    From _SPSS Statistical Algorithms, 2nd ed., 0-918469-89-9,
482    section "DESCRIPTIVES", except that the sqrt symbol is omitted
483    there. */
484 double
calc_sekurt(double W)485 calc_sekurt (double W)
486 {
487   return sqrt ((4. * (pow2 (W) - 1.) * pow2 (calc_seskew (W)))
488                / ((W - 3.) * (W + 5.)));
489 }
490