1 /*
2 Copyright (C) 2017,2018 John E. Davis
3
4 This file is part of the S-Lang Library.
5
6 The S-Lang Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
10
11 The S-Lang Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this library; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19 USA.
20 */
21 #include "config.h"
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <math.h>
26 #include <slang.h>
27
28 #include "_slint.h"
29 #include "stats-module.h"
30
alnorm(double x,int use_upper)31 static double alnorm (double x, int use_upper)
32 {
33 double cdf;
34 cdf = 0.5 * (1.0 + erf (x/sqrt(2.0)));
35 if (use_upper) return 1.0 - cdf;
36 return cdf;
37 }
38
prtaus_large_n(_pSLint64_Type is,_pSLint64_Type n,double * probp)39 static int prtaus_large_n (_pSLint64_Type is, _pSLint64_Type n, double *probp)
40 {
41 double h[15];
42 double x, r, sc, p;
43 int i;
44
45 /* PROBABILITIES CALCULATED BY MODIFIED EDGEWORTH SERIES FOR N GREATER THAN 8 */
46 /* CALCULATION OF TCHEBYCHEFF-HERMITE POLYNOMIALS */
47 x = (is-1.0) / sqrt((6.0 + n*(5.0-n*(3.0+2*n)))/(-18.0));
48 h[0] = x;
49 h[1] = x*x - 1.0;
50 for (i = 2; i < 15; i++)
51 {
52 h[i] = x * h[i-1] - (i-1.0)*h[i-2];
53 }
54
55 r = 1.0 / n;
56 sc = r*(h[2]*(r*(r*(r*0.506f - 0.5325f) + 0.045f) - 0.09f)
57 + r*(h[4]*(r*(r*0.3214f - 0.036735f) + 0.036735f)
58 + h[6]*(r*(r*0.07787f - 0.023336f) + 0.00405f)
59 + r*(h[8]*(-.0033061f - r*0.0065166f)
60 + h[10]*(r*0.0025927f - 1.215e-4f)
61 + r*(h[12]*1.4878e-4f + h[14]*2.7338e-6f))));
62
63 p = alnorm (x,1) + sc*0.398942*exp(-0.5*x*x);
64 if (p < 0.0) p = 0.0;
65 if (p > 1.0) p = 1.0;
66 *probp = p;
67
68 return 0;
69 }
70
71
72 /*
73 * ALGORITHM AS 71 APPL. STATIST. (1974) VOL.23, NO.1
74 *
75 * GIVEN A VALUE OF IS CALCULATED FROM TWO RANKINGS (WITHOUT TIES)
76 * OF N OBJECTS, THE FUNCTION COMPUTES THE PROBABILITY OF
77 * OBTAINING A VALUE GREATER THAN, OR EQUAL TO, IS.
78 *
79 */
prtaus(_pSLint64_Type is,_pSLint64_Type n,double * probp)80 static int prtaus (_pSLint64_Type is, _pSLint64_Type n, double *probp)
81 {
82 #define MAX_N_EXACT 30
83 #define MAX_L_COLS ((MAX_N_EXACT)*(MAX_N_EXACT-1)/4 + 2)
84 _pSLint64_Type row0[MAX_L_COLS]; /* allow 1-based indexing */
85 _pSLint64_Type row1[MAX_L_COLS]; /* allow 1-based indexing */
86 _pSLint64_Type *curr, *prev;
87 _pSLint64_Type i, il, k, m, im;
88
89 if (n > MAX_N_EXACT)
90 return prtaus_large_n (is, n, probp);
91
92 /* Use recurrence relation for n <= MAX_N_EXACT */
93
94 *probp = 1.0;
95
96 m = n*(n-1)/2;
97 if (is >= 0) m -= is; else m += is;
98 if ((m == 0) && (is <= 0)) return 0;
99
100 if (is < 0) m = m - 2;
101 im = m/2;
102
103 memset (row0, 0, (im+1)*sizeof(_pSLint64_Type));
104 memset (row1, 0, (im+1)*sizeof(_pSLint64_Type));
105 prev = row0; prev[0] = 1;
106 curr = row1; curr[0] = 1;
107
108 il = 0;
109 i = 1;
110 m = 1;
111 while (i < n)
112 {
113 _pSLint64_Type *tmp;
114 _pSLint64_Type in, io;
115
116 tmp = curr; curr = prev; prev = tmp;
117 il += i;
118 i++;
119 m = m*i;
120 k = (im < il) ? im : il;
121
122 k++;
123 in = 1;
124 io = (i <= k) ? i : k;
125 while (in < io)
126 {
127 curr[in] = curr[in-1] + prev[in];
128 in++;
129 }
130 io = 0;
131 while (in < k)
132 {
133 curr[in] = curr[in-1] + prev[in] - prev[io];
134 io++;
135 in++;
136 }
137 }
138
139 k = 0;
140 for (i = 0; i <= im; i++)
141 k += curr[i];
142
143 *probp = ((double) k)/m;
144 if (is < 0) *probp = 1.0 - *probp;
145
146 return 0;
147 }
148
kendall_insertion_sort(SLindex_Type * arr,size_t num)149 static _pSLuint64_Type kendall_insertion_sort (SLindex_Type *arr, size_t num)
150 {
151 size_t maxj, i;
152 _pSLuint64_Type nexch;
153
154 if (num < 2) return 0;
155
156 nexch = 0;
157 i = maxj = num - 1;
158 while (i--)
159 {
160 size_t j = i;
161 SLindex_Type val = arr[i];
162
163 while ((j < maxj) && (arr[j+1] < val))
164 {
165 arr[j] = arr[j+1];
166 j++;
167 }
168 arr[j] = val;
169 nexch += (j - i);
170 }
171 return nexch;
172 }
173
kendall_merge(SLindex_Type * left,size_t left_num,SLindex_Type * right,size_t right_num,SLindex_Type * work)174 static _pSLuint64_Type kendall_merge (SLindex_Type *left, size_t left_num,
175 SLindex_Type *right, size_t right_num,
176 SLindex_Type *work)
177 {
178 _pSLuint64_Type nexch = 0;
179
180 while (left_num && right_num)
181 {
182 if (*right < *left)
183 {
184 *work++ = *right++;
185 right_num--;
186 nexch += left_num;
187 continue;
188 }
189
190 *work++ = *left++;
191 left_num--;
192 }
193
194 if (left_num)
195 memcpy(work, left, left_num * sizeof(SLindex_Type));
196 else if (right_num)
197 memcpy(work, right, right_num * sizeof(SLindex_Type));
198
199 return nexch;
200 }
201
kendall_merge_sort(SLindex_Type * a,size_t num,SLindex_Type * work)202 static _pSLuint64_Type kendall_merge_sort(SLindex_Type *a, size_t num, SLindex_Type *work)
203 {
204 _pSLuint64_Type nexch;
205 size_t left_num, right_num;
206 SLindex_Type *left, *right;
207
208 if (num < 8)
209 return kendall_insertion_sort (a, num);
210
211 left = a;
212 left_num = num/2;
213 right = a + left_num;
214 right_num = num - left_num;
215
216 nexch = kendall_merge_sort(left, left_num, work);
217 nexch += kendall_merge_sort(right, right_num, work);
218 nexch += kendall_merge(left, left_num, right, right_num, work);
219
220 memcpy(a, work, num * sizeof(SLindex_Type));
221 return nexch;
222 }
223
224 /* This also computes the quantities needed for the p-value. From wikipedia:
225 * v = sum(x*(x-1)*(2*x+5))
226 * v1 = sum (x*(x-1))
227 * v2 = sum(x*(x-1)*(x-2))
228 * where x is the number of tied values in a group.
229 */
230 static _pSLuint64_Type
kendall_count_tied_pairs(SLindex_Type * a,size_t num,_pSLuint64_Type * v,_pSLuint64_Type * v1,_pSLuint64_Type * v2)231 kendall_count_tied_pairs (SLindex_Type *a, size_t num,
232 _pSLuint64_Type *v, _pSLuint64_Type *v1, _pSLuint64_Type *v2)
233 {
234 _pSLuint64_Type n = 0;
235 size_t i;
236
237 i = 1;
238 while (i < num)
239 {
240 _pSLuint64_Type di, dn;
241 size_t i0;
242
243 if (a[i] != a[i-1])
244 {
245 i++;
246 continue;
247 }
248
249 /* In a group with ties */
250 i0 = i-1;
251 i++;
252 while ((i < num) && (a[i] == a[i-1]))
253 i++;
254
255 di = i-i0;
256 dn = di*(di-1);
257 *v1 += dn;
258 *v2 += dn*(di-2);
259 *v += dn * (2*di+5);
260 n += dn/2;
261
262 i++;
263 }
264
265 return n;
266 }
267
268 /*
269 * This function assumes that the input arrays are sorted on the first array.
270 * That is, the slang code that wraps this will have to do:
271 *
272 * i = array_sort (a);
273 * a = a[i]; b = [i];
274 *
275 * The basic idea is the following:
276 *
277 * The total number of pairs formed from an array of size N is
278 * n0 = N*(N-1)/2. Consider 2 such arrays A and B. Then consider the ith and
279 * jth elemnts of the arrays, where j>i. One of the following will be true:
280 *
281 * A[i]>A[j] and B[i]>B[j] "concordent"
282 * A[i]<A[j] and B[i]<B[j] "concordent"
283 * A[i]>A[j] and B[i]<B[j] "disconcordent"
284 * A[i]<A[j] and B[i]>B[j] "disconcordent"
285 * A[i]=A[j] and B[i]!=B[j] "A is tied"
286 * A[i]!=A[j] and B[i]=B[j] "B is tied"
287 * A[i]=A[j] and B[i]=B[j] "joint tie"
288 *
289 * Then: n0 = nc + nd + (t + v) + u
290 * n0 = nc + nd + (u + v) + t
291 * where nc=num concordent, nd = num disconcordent, t=A ties, u=B ties, v=joint ties
292 *
293 * Let t+v = nA = total number of ties in A (includes joint)
294 * u+v = nB = total number of ties in B (includes joint)
295 * Then: n0 = nc + nd + nA + u
296 *
297 * nc + nd = n0 - nA - u
298 * = n0 - nA - (nB - v)
299 * = n0 + v - (nA + nB)
300 * Knight indicates that nd is equal to the number of exchanges (ne) in sorting B.
301 * Hence:
302 *
303 * nc - nd = (nc + nd) - 2*ne
304 * = (n0 + v) - (nA + nB + 2*ne)
305 */
_pSLstats_kendall_tau(SLindex_Type * a,SLindex_Type * b,size_t num,double * taup)306 double _pSLstats_kendall_tau (SLindex_Type *a, SLindex_Type *b, size_t num, double *taup)
307 {
308 double tau, sigma, prob;
309 _pSLuint64_Type n0, na, nb, ne, v;
310 _pSLuint64_Type va, va1, va2, vb, vb1, vb2;
311 size_t i;
312
313 n0 = num;
314 n0 = n0*(n0-1)/2;
315
316 na = v = 0;
317 va = va1 = va2 = 0;
318 vb = vb1 = vb2 = 0;
319
320 i = 1;
321 while (i < num)
322 {
323 _pSLuint64_Type di;
324 size_t i0;
325
326 if (a[i-1] != a[i])
327 {
328 i++;
329 continue;
330 }
331
332 i0 = i - 1;
333 i++;
334 while ((i < num) && (a[i-1] == a[i]))
335 i++;
336
337 di = i-i0;
338 na += di*(di-1)/2;
339
340 (void) kendall_insertion_sort (b+i0, di);
341 v += kendall_count_tied_pairs(b+i0, di, &va, &va1, &va2);
342 i++; /* ok to do since if i<num, the a[i-1]!=a[i] */
343 }
344
345 /* Sort b, using a as workspace */
346 ne = kendall_merge_sort (b, num, a);
347 nb = kendall_count_tied_pairs (b, num, &vb, &vb1, &vb2);
348
349 /* Of no ties, use exact probability distribution */
350 if ((na == 0) && (nb == 0))
351 {
352 _pSLint64_Type is;
353
354 if (n0 < 2*ne)
355 is = -(_pSLint64_Type)(2*ne - n0);
356 else
357 is = (_pSLint64_Type)(n0 - 2*ne);
358
359 *taup = ((double)is)/n0;
360
361 /* prob is probability of getting a statistic >= is. */
362 (void) prtaus (is, num, &prob);
363 prob = 1.0 - prob; /* prob of statistic < is */
364 /* fprintf (stdout, "prtaus: num=%lu, is=%lu, prob=%g\n", num, is, prob); */
365 return prob;
366 }
367
368 /* Otherwise, use normal distributuon */
369 tau = (n0 + v - na - nb - ne); /* This should be >= 0 */
370 tau -= ne; /* may be < 0 */
371
372 /* From wikipedia */
373 sigma = (n0*(4.0*num+10.0) - va - vb)/18.0;
374 sigma += va1*(double)vb1/(4.0*n0);
375 sigma += va2*(double)vb2/(18.0*n0*(num-2));
376 sigma = sqrt (sigma);
377
378 *taup = tau/sqrt(n0-na)/sqrt(n0-nb); /* avoid overflow */
379
380 /* To compute the probability use the continuity correction recommended by Kendall */
381 if (tau > 0) tau -= 1; else if (tau < 0) tau += 1;
382 return 0.5 * (1.0 + erf (tau/sigma/sqrt(2.0)));
383 }
384