1 /***************************************************************************
2 File : ShapiroWilkTest.cpp
3 Project : QtiPlot
4 --------------------------------------------------------------------
5 Copyright : (C) 2010 by Ion Vasilief
6 Email (use @ for *) : ion_vasilief*yahoo.fr
7 Description : Normality test. The code was taken from R (see file swilk.c)
8 and adapted to C++.
9 ***************************************************************************/
10
11 /***************************************************************************
12 * *
13 * This program is free software; you can redistribute it and/or modify *
14 * it under the terms of the GNU General Public License as published by *
15 * the Free Software Foundation; either version 2 of the License, or *
16 * (at your option) any later version. *
17 * *
18 * This program is distributed in the hope that it will be useful, *
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
21 * GNU General Public License for more details. *
22 * *
23 * You should have received a copy of the GNU General Public License *
24 * along with this program; if not, write to the Free Software *
25 * Foundation, Inc., 51 Franklin Street, Fifth Floor, *
26 * Boston, MA 02110-1301 USA *
27 * *
28 ***************************************************************************/
29
30 #include <ShapiroWilkTest.h>
31 #include <gsl/gsl_randist.h>
32 #include <gsl/gsl_cdf.h>
33 #include <gsl/gsl_sort.h>
34
35 #ifndef min
36 # define min(a, b) ((a) > (b) ? (b) : (a))
37 #endif
38
39 #ifndef sign
40 # define sign(a) ((a) >= 0 ? 1.0 : -1.0)
41 #endif
42
ShapiroWilkTest(ApplicationWindow * parent,const QString & sample)43 ShapiroWilkTest::ShapiroWilkTest(ApplicationWindow *parent, const QString& sample)
44 : StatisticTest(parent, 0.0, 0.05, sample),
45 d_w(0.0),
46 d_pValue(0.0)
47 {
48 if (d_n < 3 || d_n > 5000){
49 QMessageBox::critical(parent, QObject::tr("Attention!"), QObject::tr("Sample size must be between 3 and 5000."));
50 this->freeMemory();
51 } else {
52 setObjectName(QObject::tr("Shapiro-Wilk Normality Test"));
53 int init = false;
54 int n = d_n;
55 int n1 = d_n;
56 int n2 = d_n/2;
57 int error = 0;
58 double a[n2];
59 gsl_sort(d_data, 1, d_n);// the data must be sorted first
60 swilk(&init, d_data, &n, &n1, &n2, a, &d_w, &d_pValue, &error);
61 }
62 }
63
logInfo()64 QString ShapiroWilkTest::logInfo()
65 {
66 QString s = "\n[" + QDateTime::currentDateTime().toString(Qt::LocalDate)+ " \"" + d_table->objectName() + "\"]\t";
67 s += QObject::tr("Normality Test (Shapiro - Wilk)") + "\n\n";
68 return s + infoString();
69 }
70
shortLogInfo()71 QString ShapiroWilkTest::shortLogInfo()
72 {
73 return infoString(false);
74 }
75
infoString(bool header)76 QString ShapiroWilkTest::infoString(bool header)
77 {
78 ApplicationWindow *app = (ApplicationWindow *)parent();
79 QLocale l = app->locale();
80 int p = app->d_decimal_digits;
81
82 QStringList lst;
83 lst << QObject::tr("Dataset");
84 lst << QObject::tr("N");
85 lst << QObject::tr("W");
86 lst << QObject::tr("P Value");
87 lst << d_col_name;
88 lst << QString::number(d_n);
89 lst << l.toString(d_w, 'g', p);
90 lst << l.toString(d_pValue, 'g', p);
91
92 QFontMetrics fm(app->font());
93 int width = 0;
94 foreach(QString aux, lst){
95 int aw = fm.width(aux);
96 if (aw > width)
97 width = aw;
98 }
99 width += 6;
100
101 QString head;
102 for (int i = 0; i < 4; i++){
103 QString aux = lst[i];
104 int spaces = ceil((double)(width - fm.width(aux))/(double)fm.width(QLatin1Char(' '))) + 1;
105 head += aux + QString(spaces, QLatin1Char(' '));
106 }
107
108 head += QObject::tr("Decision");
109
110 QString s;
111 if (header)
112 s = head;
113
114 QString val;
115 for (int i = 4; i < lst.size(); i++){
116 QString aux = lst[i];
117 int spaces = ceil((double)(width - fm.width(aux))/(double)fm.width(QLatin1Char(' '))) + 1;
118 val += aux + QString(spaces, QLatin1Char(' '));
119 }
120
121 if (d_pValue >= d_significance_level)
122 val += QObject::tr("Normal at %1 level").arg(l.toString(d_significance_level));
123 else
124 val += QObject::tr("Not normal at %1 level").arg(l.toString(d_significance_level));
125
126 if (header){
127 int scores = ceil((double)fm.width(val)/(double)fm.width(QLatin1Char('-')));
128 s +="\n" + QString(scores, QLatin1Char('-')) + "\n";
129 }
130
131 return s + val + "\n";
132 }
133
swilk(int * init,double * x,int * n,int * n1,int * n2,double * a,double * w,double * pw,int * ifault)134 void ShapiroWilkTest::swilk(int *init,/* logical: is a[] already initialized ? */
135 double *x, int *n, int *n1, int *n2,
136 double *a,/* coefficients a[] */
137 double *w, double *pw, int *ifault)
138 {
139
140 /* ALGORITHM AS R94 APPL. STATIST. (1995) vol.44, no.4, 547-551.
141
142 Calculates the Shapiro-Wilk W test and its significance level
143 */
144
145 /* Initialized data */
146 const static double zero = 0.f;
147 const static double one = 1.f;
148 const static double two = 2.f;
149
150 const static double small = 1e-19f;
151
152 /* polynomial coefficients */
153 const static double g[2] = { -2.273f,.459f };
154 const static double
155 c1[6] = { 0.f,.221157f,-.147981f,-2.07119f, 4.434685f, -2.706056f },
156 c2[6] = { 0.f,.042981f,-.293762f,-1.752461f,5.682633f, -3.582633f };
157 const static double c3[4] = { .544f,-.39978f,.025054f,-6.714e-4f };
158 const static double c4[4] = { 1.3822f,-.77857f,.062767f,-.0020322f };
159 const static double c5[4] = { -1.5861f,-.31082f,-.083751f,.0038915f };
160 const static double c6[3] = { -.4803f,-.082676f,.0030302f };
161 const static double c7[2] = { .164f,.533f };
162 const static double c8[2] = { .1736f,.315f };
163 const static double c9[2] = { .256f,-.00635f };
164
165 /* System generated locals */
166 double r__1;
167
168 /*
169 Auxiliary routines : poly() {below}
170 */
171 /* Local variables */
172 int i, j, ncens, i1, nn2;
173
174 double zbar, ssassx, summ2, ssumm2, gamma, delta, range;
175 double a1, a2, an, bf, ld, m, s, sa, xi, sx, xx, y, w1;
176 double fac, asa, an25, ssa, z90f, sax, zfm, z95f, zsd, z99f, rsn, ssx, xsx;
177
178 *pw = 1.;
179 if (*w >= 0.) {
180 *w = 1.;
181 }
182 if (*n < 3) { *ifault = 1; return;
183 }
184
185 an = (double) (*n);
186 nn2 = *n / 2;
187 if (*n2 < nn2) { *ifault = 3; return;
188 }
189 if (*n1 < 3) { *ifault = 1; return;
190 }
191 ncens = *n - *n1;
192 if (ncens < 0 || (ncens > 0 && *n < 20)) { *ifault = 4; return;
193 }
194 if (ncens > 0) {
195 delta = (double) ncens / an;
196 if (delta > .8f) { *ifault = 5; return;
197 }
198 } /* just for -Wall:*/ else { delta = 0.f; }
199
200 --a; /* so we can keep using 1-based indices */
201
202 /* If INIT is false (always when called from R),
203 * calculate coefficients a[] for the test statistic W */
204 if (! (*init)) {
205 if (*n == 3) {
206 const static double sqrth = .70710678f;/* = sqrt(1/2), was .70711f */
207 a[1] = sqrth;
208 } else {
209 an25 = an + .25;
210 summ2 = zero;
211 for (i = 1; i <= *n2; ++i) {
212 a[i] = gsl_cdf_ugaussian_Pinv ((i - .375f) / an25);
213 r__1 = a[i];
214 summ2 += r__1 * r__1;
215 }
216 summ2 *= two;
217 ssumm2 = sqrt(summ2);
218 rsn = one / sqrt(an);
219 a1 = poly(c1, 6, rsn) - a[1] / ssumm2;
220
221 /* Normalize a[] */
222 if (*n > 5) {
223 i1 = 3;
224 a2 = -a[2] / ssumm2 + poly(c2, 6, rsn);
225 fac = sqrt((summ2 - two * (a[1] * a[1]) - two * (a[2] * a[2]))
226 / (one - two * (a1 * a1) - two * (a2 * a2)));
227 a[2] = a2;
228 } else {
229 i1 = 2;
230 fac = sqrt((summ2 - two * (a[1] * a[1])) /
231 ( one - two * (a1 * a1)));
232 }
233 a[1] = a1;
234 for (i = i1; i <= nn2; ++i)
235 a[i] /= - fac;
236 }
237 *init = (1);
238 }
239
240 /* If W is input as negative, calculate significance level of -W */
241
242 if (*w < zero) {
243 w1 = 1. + *w;
244 *ifault = 0;
245 goto L70;
246 }
247
248 /* Check for zero range */
249
250 range = x[*n1 - 1] - x[0];
251 if (range < small) {
252 *ifault = 6; return;
253 }
254
255 /* Check for correct sort order on range - scaled X */
256
257 /* *ifault = 7; <-- a no-op, since it is changed below, in ANY CASE! */
258 *ifault = 0;
259 xx = x[0] / range;
260 sx = xx;
261 sa = -a[1];
262 j = *n - 1;
263 for (i = 1; i < *n1; --j) {
264 xi = x[i] / range;
265 if (xx - xi > small) {
266 /* Fortran had: print *, "ANYTHING"
267 * but do NOT; it *does* happen with sorted x (on Intel GNU/linux 32bit):
268 * shapiro.test(c(-1.7, -1,-1,-.73,-.61,-.5,-.24, .45,.62,.81,1))
269 */
270 *ifault = 7;
271 }
272 sx += xi;
273 ++i;
274 if (i != j)
275 sa += sign(i - j) * a[min(i,j)];
276 xx = xi;
277 }
278 if (*n > 5000) {
279 *ifault = 2;
280 }
281
282 /* Calculate W statistic as squared correlation
283 between data and coefficients */
284
285 sa /= *n1;
286 sx /= *n1;
287 ssa = ssx = sax = zero;
288 j = *n - 1;
289 for (i = 0; i < *n1; ++i, --j) {
290 if (i != j)
291 asa = sign(i - j) * a[1+min(i,j)] - sa;
292 else
293 asa = -sa;
294 xsx = x[i] / range - sx;
295 ssa += asa * asa;
296 ssx += xsx * xsx;
297 sax += asa * xsx;
298 }
299
300 /* W1 equals (1-W) calculated to avoid excessive rounding error
301 for W very near 1 (a potential problem in very large samples) */
302
303 ssassx = sqrt(ssa * ssx);
304 w1 = (ssassx - sax) * (ssassx + sax) / (ssa * ssx);
305 L70:
306 *w = 1. - w1;
307
308 /* Calculate significance level for W */
309
310 if (*n == 3) {/* exact P value : */
311 const static double pi6 = 1.90985931710274;/* = 6/pi, was 1.909859f */
312 const static double stqr= 1.04719755119660;/* = asin(sqrt(3/4)), was 1.047198f */
313 *pw = pi6 * (asin(sqrt(*w)) - stqr);
314 if(*pw < 0.) *pw = 0.;
315 return;
316 }
317 y = log(w1);
318 xx = log(an);
319 if (*n <= 11) {
320 gamma = poly(g, 2, an);
321 if (y >= gamma) {
322 *pw = 1e-99;/* an "obvious" value, was 'small' which was 1e-19f */
323 return;
324 }
325 y = -log(gamma - y);
326 m = poly(c3, 4, an);
327 s = exp(poly(c4, 4, an));
328 } else {/* n >= 12 */
329 m = poly(c5, 4, xx);
330 s = exp(poly(c6, 3, xx));
331 }
332 /*DBG printf("c(w1=%g, w=%g, y=%g, m=%g, s=%g)\n",w1,*w,y,m,s); */
333
334 if (ncens > 0) {/* <==> n > n1 --- not happening currently when called from R */
335
336 /* Censoring by proportion NCENS/N.
337 Calculate mean and sd of normal equivalent deviate of W. */
338
339 const static double three = 3.f;
340
341 const static double z90 = 1.2816f;
342 const static double z95 = 1.6449f;
343 const static double z99 = 2.3263f;
344 const static double zm = 1.7509f;
345 const static double zss = .56268f;
346 const static double bf1 = .8378f;
347
348 const static double xx90 = .556;
349 const static double xx95 = .622;
350
351 ld = -log(delta);
352 bf = one + xx * bf1;
353 r__1 = pow(xx90, (double) xx);
354 z90f = z90 + bf * pow(poly(c7, 2, r__1), (double) ld);
355 r__1 = pow(xx95, (double) xx);
356 z95f = z95 + bf * pow(poly(c8, 2, r__1), (double) ld);
357 z99f = z99 + bf * pow(poly(c9, 2, xx), (double)ld);
358
359 /* Regress Z90F,...,Z99F on normal deviates Z90,...,Z99 to get
360 pseudo-mean and pseudo-sd of z as the slope and intercept */
361
362 zfm = (z90f + z95f + z99f) / three;
363 zsd = (z90 * (z90f - zfm) +
364 z95 * (z95f - zfm) + z99 * (z99f - zfm)) / zss;
365 zbar = zfm - zsd * zm;
366 m += zbar * s;
367 s *= zsd;
368 }
369 //*pw = pnorm((double) y, (double)m, (double)s, 0/* upper tail */, 0);
370 *pw = gsl_cdf_gaussian_Q(y - m, s);
371 /* = alnorm_(dble((Y - M)/S), 1); */
372
373 return;
374 } /* swilk */
375
poly(const double * cc,int nord,double x)376 double ShapiroWilkTest::poly(const double *cc, int nord, double x)
377 {
378 /* Algorithm AS 181.2 Appl. Statist. (1982) Vol. 31, No. 2
379
380 Calculates the algebraic polynomial of order nord-1 with
381 array of coefficients cc. Zero order coefficient is cc(1) = cc[0]
382 */
383 /* Local variables */
384 int j;
385 double p, ret_val;/* preserve precision! */
386
387 ret_val = cc[0];
388 if (nord > 1) {
389 p = x * cc[nord-1];
390 for (j = nord - 2; j > 0; j--)
391 p = (p + cc[j]) * x;
392
393 ret_val += p;
394 }
395 return ret_val;
396 } /* poly */
397