1 /*
2 Copyright (C) 2017,2018 John E. Davis
3 
4 This file is part of the S-Lang Library.
5 
6 The S-Lang Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
10 
11 The S-Lang Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with this library; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
19 USA.
20 */
21 #include "config.h"
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <math.h>
26 #include <slang.h>
27 
28 #include "_slint.h"
29 #include "stats-module.h"
30 
alnorm(double x,int use_upper)31 static double alnorm (double x, int use_upper)
32 {
33    double cdf;
34    cdf = 0.5 * (1.0 + erf (x/sqrt(2.0)));
35    if (use_upper) return 1.0 - cdf;
36    return cdf;
37 }
38 
prtaus_large_n(_pSLint64_Type is,_pSLint64_Type n,double * probp)39 static int prtaus_large_n (_pSLint64_Type is, _pSLint64_Type n, double *probp)
40 {
41    double h[15];
42    double x, r, sc, p;
43    int i;
44 
45    /* PROBABILITIES CALCULATED BY MODIFIED EDGEWORTH SERIES FOR N GREATER THAN 8 */
46    /* CALCULATION OF TCHEBYCHEFF-HERMITE POLYNOMIALS */
47    x = (is-1.0) / sqrt((6.0 + n*(5.0-n*(3.0+2*n)))/(-18.0));
48    h[0] = x;
49    h[1] = x*x - 1.0;
50    for (i = 2; i < 15; i++)
51      {
52 	h[i] = x * h[i-1] - (i-1.0)*h[i-2];
53      }
54 
55    r = 1.0 / n;
56    sc = r*(h[2]*(r*(r*(r*0.506f - 0.5325f) + 0.045f) - 0.09f)
57 	   + r*(h[4]*(r*(r*0.3214f - 0.036735f) + 0.036735f)
58 		+ h[6]*(r*(r*0.07787f - 0.023336f) + 0.00405f)
59 		+ r*(h[8]*(-.0033061f - r*0.0065166f)
60 		     + h[10]*(r*0.0025927f - 1.215e-4f)
61 		     + r*(h[12]*1.4878e-4f + h[14]*2.7338e-6f))));
62 
63    p = alnorm (x,1) + sc*0.398942*exp(-0.5*x*x);
64    if (p < 0.0) p = 0.0;
65    if (p > 1.0) p = 1.0;
66    *probp = p;
67 
68    return 0;
69 }
70 
71 
72 /*
73  * ALGORITHM AS 71  APPL. STATIST. (1974) VOL.23, NO.1
74  *
75  * GIVEN A VALUE OF IS CALCULATED FROM TWO RANKINGS (WITHOUT TIES)
76  * OF N OBJECTS, THE FUNCTION COMPUTES THE PROBABILITY OF
77  * OBTAINING A VALUE GREATER THAN, OR EQUAL TO, IS.
78  *
79  */
prtaus(_pSLint64_Type is,_pSLint64_Type n,double * probp)80 static int prtaus (_pSLint64_Type is, _pSLint64_Type n, double *probp)
81 {
82 #define MAX_N_EXACT 30
83 #define MAX_L_COLS ((MAX_N_EXACT)*(MAX_N_EXACT-1)/4 + 2)
84    _pSLint64_Type row0[MAX_L_COLS];  /* allow 1-based indexing */
85    _pSLint64_Type row1[MAX_L_COLS];  /* allow 1-based indexing */
86    _pSLint64_Type *curr, *prev;
87    _pSLint64_Type i, il, k, m, im;
88 
89    if (n > MAX_N_EXACT)
90      return prtaus_large_n (is, n, probp);
91 
92    /* Use recurrence relation for n <= MAX_N_EXACT */
93 
94    *probp = 1.0;
95 
96    m = n*(n-1)/2;
97    if (is >= 0) m -= is; else m += is;
98    if ((m == 0) && (is <= 0)) return 0;
99 
100    if (is < 0) m = m - 2;
101    im = m/2;
102 
103    memset (row0, 0, (im+1)*sizeof(_pSLint64_Type));
104    memset (row1, 0, (im+1)*sizeof(_pSLint64_Type));
105    prev = row0; prev[0] = 1;
106    curr = row1; curr[0] = 1;
107 
108    il = 0;
109    i = 1;
110    m = 1;
111    while (i < n)
112      {
113 	_pSLint64_Type *tmp;
114 	_pSLint64_Type in, io;
115 
116 	tmp = curr; curr = prev; prev = tmp;
117         il += i;
118         i++;
119         m = m*i;
120 	k = (im < il) ? im : il;
121 
122 	k++;
123 	in = 1;
124 	io = (i <= k) ? i : k;
125 	while (in < io)
126 	  {
127 	     curr[in] = curr[in-1] + prev[in];
128 	     in++;
129 	  }
130 	io = 0;
131 	while (in < k)
132 	  {
133 	     curr[in] = curr[in-1] + prev[in] - prev[io];
134 	     io++;
135 	     in++;
136 	  }
137      }
138 
139    k = 0;
140    for (i = 0; i <= im; i++)
141      k += curr[i];
142 
143    *probp = ((double) k)/m;
144    if (is < 0) *probp = 1.0 - *probp;
145 
146    return 0;
147 }
148 
kendall_insertion_sort(SLindex_Type * arr,size_t num)149 static _pSLuint64_Type kendall_insertion_sort (SLindex_Type *arr, size_t num)
150 {
151    size_t maxj, i;
152    _pSLuint64_Type nexch;
153 
154    if (num < 2) return 0;
155 
156    nexch = 0;
157    i = maxj = num - 1;
158    while (i--)
159      {
160         size_t j = i;
161         SLindex_Type val = arr[i];
162 
163 	while ((j < maxj) && (arr[j+1] < val))
164 	  {
165 	     arr[j] = arr[j+1];
166 	     j++;
167 	  }
168         arr[j] = val;
169 	nexch += (j - i);
170     }
171    return nexch;
172 }
173 
kendall_merge(SLindex_Type * left,size_t left_num,SLindex_Type * right,size_t right_num,SLindex_Type * work)174 static _pSLuint64_Type kendall_merge (SLindex_Type *left, size_t left_num,
175 			    SLindex_Type *right, size_t right_num,
176 			    SLindex_Type *work)
177 {
178    _pSLuint64_Type nexch = 0;
179 
180    while (left_num && right_num)
181      {
182         if (*right < *left)
183 	  {
184 	     *work++ = *right++;
185 	     right_num--;
186 	     nexch += left_num;
187 	     continue;
188 	  }
189 
190 	*work++ = *left++;
191 	left_num--;
192      }
193 
194    if (left_num)
195      memcpy(work, left, left_num * sizeof(SLindex_Type));
196    else if (right_num)
197      memcpy(work, right, right_num * sizeof(SLindex_Type));
198 
199    return nexch;
200 }
201 
kendall_merge_sort(SLindex_Type * a,size_t num,SLindex_Type * work)202 static _pSLuint64_Type kendall_merge_sort(SLindex_Type *a, size_t num, SLindex_Type *work)
203 {
204    _pSLuint64_Type nexch;
205    size_t left_num, right_num;
206    SLindex_Type *left, *right;
207 
208    if (num < 8)
209      return kendall_insertion_sort (a, num);
210 
211    left = a;
212    left_num = num/2;
213    right = a + left_num;
214    right_num = num - left_num;
215 
216    nexch = kendall_merge_sort(left, left_num, work);
217    nexch += kendall_merge_sort(right, right_num, work);
218    nexch += kendall_merge(left, left_num, right, right_num, work);
219 
220    memcpy(a, work, num * sizeof(SLindex_Type));
221    return nexch;
222 }
223 
224 /* This also computes the quantities needed for the p-value.  From wikipedia:
225  *   v = sum(x*(x-1)*(2*x+5))
226  *   v1 = sum (x*(x-1))
227  *   v2 = sum(x*(x-1)*(x-2))
228  * where x is the number of tied values in a group.
229  */
230 static _pSLuint64_Type
kendall_count_tied_pairs(SLindex_Type * a,size_t num,_pSLuint64_Type * v,_pSLuint64_Type * v1,_pSLuint64_Type * v2)231 kendall_count_tied_pairs (SLindex_Type *a, size_t num,
232 			  _pSLuint64_Type *v, _pSLuint64_Type *v1, _pSLuint64_Type *v2)
233 {
234    _pSLuint64_Type n = 0;
235    size_t i;
236 
237    i = 1;
238    while (i < num)
239      {
240 	_pSLuint64_Type di, dn;
241 	size_t i0;
242 
243 	if (a[i] != a[i-1])
244 	  {
245 	     i++;
246 	     continue;
247 	  }
248 
249 	/* In a group with ties */
250 	i0 = i-1;
251 	i++;
252 	while ((i < num) && (a[i] == a[i-1]))
253 	  i++;
254 
255 	di = i-i0;
256 	dn = di*(di-1);
257 	*v1 += dn;
258 	*v2 += dn*(di-2);
259 	*v += dn * (2*di+5);
260 	n += dn/2;
261 
262 	i++;
263      }
264 
265    return n;
266 }
267 
268 /*
269  * This function assumes that the input arrays are sorted on the first array.
270  * That is, the slang code that wraps this will have to do:
271  *
272  *    i = array_sort (a);
273  *    a = a[i]; b = [i];
274  *
275  * The basic idea is the following:
276  *
277  * The total number of pairs formed from an array of size N is
278  * n0 = N*(N-1)/2.  Consider 2 such arrays A and B.  Then consider the ith and
279  * jth elemnts of the arrays, where j>i.  One of the following will be true:
280  *
281  *     A[i]>A[j] and B[i]>B[j]     "concordent"
282  *     A[i]<A[j] and B[i]<B[j]     "concordent"
283  *     A[i]>A[j] and B[i]<B[j]     "disconcordent"
284  *     A[i]<A[j] and B[i]>B[j]     "disconcordent"
285  *     A[i]=A[j] and B[i]!=B[j]    "A is tied"
286  *     A[i]!=A[j] and B[i]=B[j]    "B is tied"
287  *     A[i]=A[j] and B[i]=B[j]     "joint tie"
288  *
289  * Then: n0 = nc + nd + (t + v) + u
290  *       n0 = nc + nd + (u + v) + t
291  * where nc=num concordent, nd = num disconcordent, t=A ties, u=B ties, v=joint ties
292  *
293  * Let t+v = nA = total number of ties in A (includes joint)
294  *     u+v = nB = total number of ties in B (includes joint)
295  * Then: n0 = nc + nd + nA + u
296  *
297  *   nc + nd = n0 - nA - u
298  *           = n0 - nA - (nB - v)
299  *           = n0 + v - (nA + nB)
300  * Knight indicates that nd is equal to the number of exchanges (ne) in sorting B.
301  * Hence:
302  *
303  *   nc - nd = (nc + nd) - 2*ne
304  *           = (n0 + v) - (nA + nB + 2*ne)
305  */
_pSLstats_kendall_tau(SLindex_Type * a,SLindex_Type * b,size_t num,double * taup)306 double _pSLstats_kendall_tau (SLindex_Type *a, SLindex_Type *b, size_t num, double *taup)
307 {
308    double tau, sigma, prob;
309    _pSLuint64_Type n0, na, nb, ne, v;
310    _pSLuint64_Type va, va1, va2, vb, vb1, vb2;
311    size_t i;
312 
313    n0 = num;
314    n0 = n0*(n0-1)/2;
315 
316    na = v = 0;
317    va = va1 = va2 = 0;
318    vb = vb1 = vb2 = 0;
319 
320    i = 1;
321    while (i < num)
322      {
323 	_pSLuint64_Type di;
324 	size_t i0;
325 
326 	if (a[i-1] != a[i])
327 	  {
328 	     i++;
329 	     continue;
330 	  }
331 
332 	i0 = i - 1;
333 	i++;
334 	while ((i < num) && (a[i-1] == a[i]))
335 	  i++;
336 
337 	di = i-i0;
338 	na += di*(di-1)/2;
339 
340 	(void) kendall_insertion_sort (b+i0, di);
341 	v += kendall_count_tied_pairs(b+i0, di, &va, &va1, &va2);
342 	i++;			       /* ok to do since if i<num, the a[i-1]!=a[i] */
343      }
344 
345    /* Sort b, using a as workspace */
346    ne = kendall_merge_sort (b, num, a);
347    nb = kendall_count_tied_pairs (b, num, &vb, &vb1, &vb2);
348 
349    /* Of no ties, use exact probability distribution */
350    if ((na == 0) && (nb == 0))
351      {
352 	_pSLint64_Type is;
353 
354 	if (n0 < 2*ne)
355 	  is = -(_pSLint64_Type)(2*ne - n0);
356 	else
357 	  is = (_pSLint64_Type)(n0 - 2*ne);
358 
359 	*taup = ((double)is)/n0;
360 
361 	/* prob is probability of getting a statistic >= is. */
362 	(void) prtaus (is, num, &prob);
363 	prob = 1.0 - prob;	       /* prob of statistic < is */
364 	/* fprintf (stdout, "prtaus: num=%lu, is=%lu, prob=%g\n", num, is, prob); */
365 	return prob;
366      }
367 
368    /* Otherwise, use normal distributuon */
369    tau = (n0 + v - na - nb - ne);      /* This should be >= 0 */
370    tau -= ne;			       /* may be < 0 */
371 
372    /* From wikipedia */
373    sigma = (n0*(4.0*num+10.0) - va - vb)/18.0;
374    sigma += va1*(double)vb1/(4.0*n0);
375    sigma += va2*(double)vb2/(18.0*n0*(num-2));
376    sigma = sqrt (sigma);
377 
378    *taup = tau/sqrt(n0-na)/sqrt(n0-nb);  /* avoid overflow */
379 
380    /* To compute the probability use the continuity correction recommended by Kendall */
381    if (tau > 0) tau -= 1; else if (tau < 0) tau += 1;
382    return 0.5 * (1.0 + erf (tau/sigma/sqrt(2.0)));
383 }
384