1 /***************************************************************************
2 	File                 : ShapiroWilkTest.cpp
3 	Project              : QtiPlot
4 	--------------------------------------------------------------------
5 	Copyright            : (C) 2010 by Ion Vasilief
6 	Email (use @ for *)  : ion_vasilief*yahoo.fr
7 	Description          : Normality test. The code was taken from R (see file swilk.c)
8 						   and adapted to C++.
9  ***************************************************************************/
10 
11 /***************************************************************************
12  *                                                                         *
13  *  This program is free software; you can redistribute it and/or modify   *
14  *  it under the terms of the GNU General Public License as published by   *
15  *  the Free Software Foundation; either version 2 of the License, or      *
16  *  (at your option) any later version.                                    *
17  *                                                                         *
18  *  This program is distributed in the hope that it will be useful,        *
19  *  but WITHOUT ANY WARRANTY; without even the implied warranty of         *
20  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          *
21  *  GNU General Public License for more details.                           *
22  *                                                                         *
23  *   You should have received a copy of the GNU General Public License     *
24  *   along with this program; if not, write to the Free Software           *
25  *   Foundation, Inc., 51 Franklin Street, Fifth Floor,                    *
26  *   Boston, MA  02110-1301  USA                                           *
27  *                                                                         *
28  ***************************************************************************/
29 
30 #include <ShapiroWilkTest.h>
31 #include <gsl/gsl_randist.h>
32 #include <gsl/gsl_cdf.h>
33 #include <gsl/gsl_sort.h>
34 
35 #ifndef min
36 # define min(a, b)		((a) > (b) ? (b) : (a))
37 #endif
38 
39 #ifndef sign
40 # define sign(a)		((a) >= 0 ? 1.0 : -1.0)
41 #endif
42 
ShapiroWilkTest(ApplicationWindow * parent,const QString & sample)43 ShapiroWilkTest::ShapiroWilkTest(ApplicationWindow *parent, const QString& sample)
44 : StatisticTest(parent, 0.0, 0.05, sample),
45 d_w(0.0),
46 d_pValue(0.0)
47 {
48 	if (d_n < 3 || d_n > 5000){
49 		QMessageBox::critical(parent, QObject::tr("Attention!"), QObject::tr("Sample size must be between 3 and 5000."));
50 		this->freeMemory();
51 	} else {
52 		setObjectName(QObject::tr("Shapiro-Wilk Normality Test"));
53 		int init = false;
54 		int n = d_n;
55 		int n1 = d_n;
56 		int n2 = d_n/2;
57 		int error = 0;
58 		double a[n2];
59 		gsl_sort(d_data, 1, d_n);// the data must be sorted first
60 		swilk(&init, d_data, &n, &n1, &n2, a, &d_w, &d_pValue, &error);
61 	}
62 }
63 
logInfo()64 QString ShapiroWilkTest::logInfo()
65 {
66 	QString s = "\n[" + QDateTime::currentDateTime().toString(Qt::LocalDate)+ " \"" + d_table->objectName() + "\"]\t";
67 	s += QObject::tr("Normality Test (Shapiro - Wilk)") + "\n\n";
68 	return s + infoString();
69 }
70 
shortLogInfo()71 QString ShapiroWilkTest::shortLogInfo()
72 {
73 	return infoString(false);
74 }
75 
infoString(bool header)76 QString ShapiroWilkTest::infoString(bool header)
77 {
78 	ApplicationWindow *app = (ApplicationWindow *)parent();
79 	QLocale l = app->locale();
80 	int p = app->d_decimal_digits;
81 
82 	QStringList lst;
83 	lst << QObject::tr("Dataset");
84 	lst << QObject::tr("N");
85 	lst << QObject::tr("W");
86 	lst << QObject::tr("P Value");
87 	lst << d_col_name;
88 	lst << QString::number(d_n);
89 	lst << l.toString(d_w, 'g', p);
90 	lst << l.toString(d_pValue, 'g', p);
91 
92 	QFontMetrics fm(app->font());
93 	int width = 0;
94 	foreach(QString aux, lst){
95 		int aw = fm.width(aux);
96 		if (aw > width)
97 			width = aw;
98 	}
99 	width += 6;
100 
101 	QString head;
102 	for (int i = 0; i < 4; i++){
103 		QString aux = lst[i];
104 		int spaces = ceil((double)(width - fm.width(aux))/(double)fm.width(QLatin1Char(' '))) + 1;
105 		head += aux + QString(spaces, QLatin1Char(' '));
106 	}
107 
108 	head += QObject::tr("Decision");
109 
110 	QString s;
111 	if (header)
112 		s = head;
113 
114 	QString val;
115 	for (int i = 4; i < lst.size(); i++){
116 		QString aux = lst[i];
117 		int spaces = ceil((double)(width - fm.width(aux))/(double)fm.width(QLatin1Char(' '))) + 1;
118 		val += aux + QString(spaces, QLatin1Char(' '));
119 	}
120 
121 	if (d_pValue >= d_significance_level)
122 		val += QObject::tr("Normal at %1 level").arg(l.toString(d_significance_level));
123 	else
124 		val += QObject::tr("Not normal at %1 level").arg(l.toString(d_significance_level));
125 
126 	if (header){
127 		int scores = ceil((double)fm.width(val)/(double)fm.width(QLatin1Char('-')));
128 		s +="\n" + QString(scores, QLatin1Char('-')) + "\n";
129 	}
130 
131 	return s + val + "\n";
132 }
133 
swilk(int * init,double * x,int * n,int * n1,int * n2,double * a,double * w,double * pw,int * ifault)134 void ShapiroWilkTest::swilk(int *init,/* logical: is a[] already initialized ? */
135 	  double *x, int *n, int *n1, int *n2,
136 	  double *a,/* coefficients a[] */
137       double *w, double *pw, int *ifault)
138 {
139 
140 /*	ALGORITHM AS R94 APPL. STATIST. (1995) vol.44, no.4, 547-551.
141 
142 	Calculates the Shapiro-Wilk W test and its significance level
143 */
144 
145     /* Initialized data */
146 	const static double zero = 0.f;
147 	const static double one = 1.f;
148 	const static double two = 2.f;
149 
150 	const static double small = 1e-19f;
151 
152     /* polynomial coefficients */
153 	const static double g[2] = { -2.273f,.459f };
154 	const static double
155       c1[6] = { 0.f,.221157f,-.147981f,-2.07119f, 4.434685f, -2.706056f },
156       c2[6] = { 0.f,.042981f,-.293762f,-1.752461f,5.682633f, -3.582633f };
157 	const static double c3[4] = { .544f,-.39978f,.025054f,-6.714e-4f };
158 	const static double c4[4] = { 1.3822f,-.77857f,.062767f,-.0020322f };
159 	const static double c5[4] = { -1.5861f,-.31082f,-.083751f,.0038915f };
160 	const static double c6[3] = { -.4803f,-.082676f,.0030302f };
161 	const static double c7[2] = { .164f,.533f };
162 	const static double c8[2] = { .1736f,.315f };
163 	const static double c9[2] = { .256f,-.00635f };
164 
165     /* System generated locals */
166 	double r__1;
167 
168 /*
169 	Auxiliary routines : poly()  {below}
170 */
171     /* Local variables */
172     int i, j, ncens, i1, nn2;
173 
174 	double zbar, ssassx, summ2, ssumm2, gamma, delta, range;
175 	double a1, a2, an, bf, ld, m, s, sa, xi, sx, xx, y, w1;
176 	double fac, asa, an25, ssa, z90f, sax, zfm, z95f, zsd, z99f, rsn, ssx, xsx;
177 
178     *pw = 1.;
179     if (*w >= 0.) {
180 	*w = 1.;
181     }
182     if (*n < 3) {	*ifault = 1; return;
183     }
184 
185 	an = (double) (*n);
186     nn2 = *n / 2;
187     if (*n2 < nn2) {	*ifault = 3; return;
188     }
189     if (*n1 < 3) {	*ifault = 1; return;
190     }
191     ncens = *n - *n1;
192     if (ncens < 0 || (ncens > 0 && *n < 20)) {	*ifault = 4; return;
193     }
194     if (ncens > 0) {
195 	delta = (double) ncens / an;
196 	if (delta > .8f) {	*ifault = 5; return;
197 	}
198     } /* just for -Wall:*/ else { delta = 0.f; }
199 
200     --a; /* so we can keep using 1-based indices */
201 
202 /*	If INIT is false (always when called from R),
203  *	calculate coefficients a[] for the test statistic W */
204     if (! (*init)) {
205 	if (*n == 3) {
206 		const static double sqrth = .70710678f;/* = sqrt(1/2), was .70711f */
207 	    a[1] = sqrth;
208 	} else {
209 	    an25 = an + .25;
210 	    summ2 = zero;
211 	    for (i = 1; i <= *n2; ++i) {
212 			a[i] = gsl_cdf_ugaussian_Pinv ((i - .375f) / an25);
213 			r__1 = a[i];
214 			summ2 += r__1 * r__1;
215 	    }
216 	    summ2 *= two;
217 	    ssumm2 = sqrt(summ2);
218 	    rsn = one / sqrt(an);
219 	    a1 = poly(c1, 6, rsn) - a[1] / ssumm2;
220 
221 	    /* Normalize a[] */
222 	    if (*n > 5) {
223 		i1 = 3;
224 		a2 = -a[2] / ssumm2 + poly(c2, 6, rsn);
225 		fac = sqrt((summ2 - two * (a[1] * a[1]) - two * (a[2] * a[2]))
226 			 / (one - two * (a1 * a1) - two * (a2 * a2)));
227 		a[2] = a2;
228 	    } else {
229 		i1 = 2;
230 		fac = sqrt((summ2 - two * (a[1] * a[1])) /
231 			   ( one  - two * (a1 * a1)));
232 	    }
233 	    a[1] = a1;
234 	    for (i = i1; i <= nn2; ++i)
235 		a[i] /= - fac;
236 	}
237 	*init = (1);
238     }
239 
240 /*	If W is input as negative, calculate significance level of -W */
241 
242     if (*w < zero) {
243 	w1 = 1. + *w;
244 	*ifault = 0;
245 	goto L70;
246     }
247 
248 /*	Check for zero range */
249 
250     range = x[*n1 - 1] - x[0];
251     if (range < small) {
252 	*ifault = 6;	return;
253     }
254 
255 /*	Check for correct sort order on range - scaled X */
256 
257     /* *ifault = 7; <-- a no-op, since it is changed below, in ANY CASE! */
258     *ifault = 0;
259     xx = x[0] / range;
260     sx = xx;
261     sa = -a[1];
262     j = *n - 1;
263     for (i = 1; i < *n1; --j) {
264 	xi = x[i] / range;
265 	if (xx - xi > small) {
266 	    /* Fortran had:	 print *, "ANYTHING"
267 	     * but do NOT; it *does* happen with sorted x (on Intel GNU/linux 32bit):
268 	     *  shapiro.test(c(-1.7, -1,-1,-.73,-.61,-.5,-.24, .45,.62,.81,1))
269 	     */
270 	    *ifault = 7;
271 	}
272 	sx += xi;
273 	++i;
274 	if (i != j)
275 		sa += sign(i - j) * a[min(i,j)];
276 	xx = xi;
277     }
278     if (*n > 5000) {
279 	*ifault = 2;
280     }
281 
282 /*	Calculate W statistic as squared correlation
283 	between data and coefficients */
284 
285     sa /= *n1;
286     sx /= *n1;
287     ssa = ssx = sax = zero;
288     j = *n - 1;
289     for (i = 0; i < *n1; ++i, --j) {
290 	if (i != j)
291 		asa = sign(i - j) * a[1+min(i,j)] - sa;
292 	else
293 	    asa = -sa;
294 	xsx = x[i] / range - sx;
295 	ssa += asa * asa;
296 	ssx += xsx * xsx;
297 	sax += asa * xsx;
298     }
299 
300 /*	W1 equals (1-W) calculated to avoid excessive rounding error
301 	for W very near 1 (a potential problem in very large samples) */
302 
303     ssassx = sqrt(ssa * ssx);
304     w1 = (ssassx - sax) * (ssassx + sax) / (ssa * ssx);
305 L70:
306     *w = 1. - w1;
307 
308 /*	Calculate significance level for W */
309 
310     if (*n == 3) {/* exact P value : */
311 	const static double pi6 = 1.90985931710274;/* = 6/pi, was  1.909859f */
312 	const static double stqr= 1.04719755119660;/* = asin(sqrt(3/4)), was 1.047198f */
313 	*pw = pi6 * (asin(sqrt(*w)) - stqr);
314 	if(*pw < 0.) *pw = 0.;
315 	return;
316     }
317     y = log(w1);
318     xx = log(an);
319     if (*n <= 11) {
320 	gamma = poly(g, 2, an);
321 	if (y >= gamma) {
322 	    *pw = 1e-99;/* an "obvious" value, was 'small' which was 1e-19f */
323 	    return;
324 	}
325 	y = -log(gamma - y);
326 	m = poly(c3, 4, an);
327 	s = exp(poly(c4, 4, an));
328     } else {/* n >= 12 */
329 	m = poly(c5, 4, xx);
330 	s = exp(poly(c6, 3, xx));
331     }
332     /*DBG printf("c(w1=%g, w=%g, y=%g, m=%g, s=%g)\n",w1,*w,y,m,s); */
333 
334     if (ncens > 0) {/* <==>  n > n1 --- not happening currently when called from R */
335 
336 /*	Censoring by proportion NCENS/N.
337 	Calculate mean and sd of normal equivalent deviate of W. */
338 
339 	const static double three = 3.f;
340 
341 	const static double z90 = 1.2816f;
342 	const static double z95 = 1.6449f;
343 	const static double z99 = 2.3263f;
344 	const static double zm = 1.7509f;
345 	const static double zss = .56268f;
346 	const static double bf1 = .8378f;
347 
348 	const static double xx90 = .556;
349 	const static double xx95 = .622;
350 
351 	ld = -log(delta);
352 	bf = one + xx * bf1;
353 	r__1 = pow(xx90, (double) xx);
354 	z90f = z90 + bf * pow(poly(c7, 2, r__1), (double) ld);
355 	r__1 = pow(xx95, (double) xx);
356 	z95f = z95 + bf * pow(poly(c8, 2, r__1), (double) ld);
357 	z99f = z99 + bf * pow(poly(c9, 2, xx), (double)ld);
358 
359 /*	Regress Z90F,...,Z99F on normal deviates Z90,...,Z99 to get
360 	pseudo-mean and pseudo-sd of z as the slope and intercept */
361 
362 	zfm = (z90f + z95f + z99f) / three;
363 	zsd = (z90 * (z90f - zfm) +
364 	       z95 * (z95f - zfm) + z99 * (z99f - zfm)) / zss;
365 	zbar = zfm - zsd * zm;
366 	m += zbar * s;
367 	s *= zsd;
368     }
369 	//*pw = pnorm((double) y, (double)m, (double)s, 0/* upper tail */, 0);
370 	*pw = gsl_cdf_gaussian_Q(y - m, s);
371     /*  = alnorm_(dble((Y - M)/S), 1); */
372 
373     return;
374 } /* swilk */
375 
poly(const double * cc,int nord,double x)376 double ShapiroWilkTest::poly(const double *cc, int nord, double x)
377 {
378 /* Algorithm AS 181.2	Appl. Statist.	(1982) Vol. 31, No. 2
379 
380 	Calculates the algebraic polynomial of order nord-1 with
381 	array of coefficients cc.  Zero order coefficient is cc(1) = cc[0]
382 */
383     /* Local variables */
384     int j;
385     double p, ret_val;/* preserve precision! */
386 
387     ret_val = cc[0];
388     if (nord > 1) {
389 	p = x * cc[nord-1];
390 	for (j = nord - 2; j > 0; j--)
391 	    p = (p + cc[j]) * x;
392 
393 	ret_val += p;
394     }
395     return ret_val;
396 } /* poly */
397