1 /*-------------------------------------------------------------------------*/
2 /* This file contains code to calculate Kendall's Tau in O(N log N) time in
3  * a manner similar to the following reference:
4  *
5  * A Computer Method for Calculating Kendall's Tau with Ungrouped Data
6  * William R. Knight Journal of the American Statistical Association,
7  * Vol. 61, No. 314, Part 1 (Jun., 1966), pp. 436-439
8  *
9  * Copyright 2010 David Simcha
10  *
11  * License:
12  * Boost Software License - Version 1.0 - August 17th, 2003
13  *
14  * Permission is hereby granted, free of charge, to any person or organization
15  * obtaining a copy of the software and accompanying documentation covered by
16  * this license (the "Software") to use, reproduce, display, distribute,
17  * execute, and transmit the Software, and to prepare derivative works of the
18  * Software, and to permit third-parties to whom the Software is furnished to
19  * do so, all subject to the following:
20  *
21  * The copyright notices in the Software and this entire statement, including
22  * the above license grant, this restriction and the following disclaimer,
23  * must be included in all copies of the Software, in whole or in part, and
24  * all derivative works of the Software, unless such copies or derivative
25  * works are solely in the form of machine-executable object code generated by
26  * a source language processor.
27  *
28  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30  * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
31  * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
32  * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
33  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
34  * DEALINGS IN THE SOFTWARE.
35  *
36  */
37 /*-------------------------------------------------------------------------*/
38 /**---------------- Adapted for AFNI by RWCox - April 2010 ---------------**/
39 /*-------------------------------------------------------------------------*/
40 
41 #include <stdlib.h>
42 #include <string.h>
43 #include <math.h>
44 #include <stdio.h>
45 #include <assert.h>
46 #include <time.h>
47 
48 #ifdef SOLARIS_OLD      /* 'if' might be overkill, but just to be minimal... */
49                         /*                               10 May 2010 [rickr] */
50 #include "machdep.h"
51 #else
52 #include <stdint.h>
53 #endif
54 
55 
56 /*-------------------------------------------------------------------------*/
57 /* Sorts in place, returns the bubble sort distance between the input array
58  * and the sorted array.
59  */
60 
insertionSort(float * arr,int len)61 static int insertionSort(float *arr, int len)
62 {
63     int maxJ, i,j , swapCount = 0;
64 
65 /* printf("enter insertionSort len=%d\n",len) ; */
66 
67     if(len < 2) { return 0; }
68 
69     maxJ = len - 1;
70     for(i = len - 2; i >= 0; --i) {
71         float  val = arr[i];
72         for(j=i; j < maxJ && arr[j + 1] < val; ++j) {
73             arr[j] = arr[j + 1];
74         }
75 
76         arr[j] = val;
77         swapCount += (j - i);
78     }
79 
80     return swapCount;
81 }
82 
83 /*-------------------------------------------------------------------------*/
84 
merge(float * from,float * to,int middle,int len)85 static int merge(float *from, float *to, int middle, int len)
86 {
87     int bufIndex, leftLen, rightLen , swaps ;
88     float *left , *right;
89 
90 /* printf("enter merge\n") ; */
91 
92     bufIndex = 0;
93     swaps = 0;
94 
95     left = from;
96     right = from + middle;
97     rightLen = len - middle;
98     leftLen = middle;
99 
100     while(leftLen && rightLen) {
101         if(right[0] < left[0]) {
102             to[bufIndex] = right[0];
103             swaps += leftLen;
104             rightLen--;
105             right++;
106         } else {
107             to[bufIndex] = left[0];
108             leftLen--;
109             left++;
110         }
111         bufIndex++;
112     }
113 
114     if(leftLen) {
115 #pragma omp critical (MEMCPY)
116         memcpy(to + bufIndex, left, leftLen * sizeof(float));
117     } else if(rightLen) {
118 #pragma omp critical (MEMCPY)
119         memcpy(to + bufIndex, right, rightLen * sizeof(float));
120     }
121 
122     return swaps;
123 }
124 
125 /*-------------------------------------------------------------------------*/
126 /* Sorts in place, returns the bubble sort distance between the input array
127  * and the sorted array.
128  */
129 
mergeSort(float * x,float * buf,int len)130 static int mergeSort(float *x, float *buf, int len)
131 {
132     int swaps , half ;
133 
134 /* printf("enter mergeSort\n") ; */
135 
136     if(len < 10) {
137         return insertionSort(x, len);
138     }
139 
140     swaps = 0;
141 
142     if(len < 2) { return 0; }
143 
144     half = len / 2;
145     swaps += mergeSort(x, buf, half);
146     swaps += mergeSort(x + half, buf + half, len - half);
147     swaps += merge(x, buf, half, len);
148 
149 #pragma omp critical (MEMCPY)
150     memcpy(x, buf, len * sizeof(float));
151     return swaps;
152 }
153 
154 /*-------------------------------------------------------------------------*/
155 
getMs(float * data,int len)156 static int getMs(float *data, int len)  /* Assumes data is sorted */
157 {
158     int Ms = 0, tieCount = 0 , i ;
159 
160 /* printf("enter getMs\n") ; */
161 
162     for(i = 1; i < len; i++) {
163         if(data[i] == data[i-1]) {
164             tieCount++;
165         } else if(tieCount) {
166             Ms += (tieCount * (tieCount + 1)) / 2;
167             tieCount = 0;
168         }
169     }
170     if(tieCount) {
171         Ms += (tieCount * (tieCount + 1)) / 2;
172     }
173     return Ms;
174 }
175 
176 /*-------------------------------------------------------------------------*/
177 /* This function calculates the Kendall correlation tau_b.
178  * The arrays arr1 should be sorted before this call, and arr2 should be
179  * re-ordered in lockstep.  This can be done by calling
180  *   qsort_floatfloat(len,arr1,arr2)
181  * for example.
182  * Note also that arr1 and arr2 will be modified, so if they need to
183  * be preserved, do so before calling this function.
184  */
185 
kendallNlogN(float * arr1,float * arr2,int len)186 float kendallNlogN( float *arr1, float *arr2, int len )
187 {
188     int m1 = 0, m2 = 0, tieCount, swapCount, nPair, s,i ;
189     float cor ;
190 
191 /* printf("enter kendallNlogN\n") ; */
192 
193     if( len < 2 ) return (float)0 ;
194 
195     nPair = len * (len - 1) / 2;
196     s = nPair;
197 
198     tieCount = 0;
199     for(i = 1; i < len; i++) {
200         if(arr1[i - 1] == arr1[i]) {
201             tieCount++;
202         } else if(tieCount > 0) {
203             insertionSort(arr2 + i - tieCount - 1, tieCount + 1);
204             m1 += tieCount * (tieCount + 1) / 2;
205             s += getMs(arr2 + i - tieCount - 1, tieCount + 1);
206             tieCount = 0;
207         }
208     }
209     if(tieCount > 0) {
210         insertionSort(arr2 + i - tieCount - 1, tieCount + 1);
211         m1 += tieCount * (tieCount + 1) / 2;
212         s += getMs(arr2 + i - tieCount - 1, tieCount + 1);
213     }
214 
215     swapCount = mergeSort(arr2, arr1, len);
216 
217     m2 = getMs(arr2, len);
218     s -= (m1 + m2) + 2 * swapCount;
219 
220     if( m1 < nPair && m2 < nPair )
221       cor = s / ( sqrtf((float)(nPair-m1)) * sqrtf((float)(nPair-m2)) ) ;
222     else
223       cor = 0.0f ;
224 
225     return cor ;
226 }
227 
228 /*-------------------------------------------------------------------------*/
229 /* This function uses a simple O(N^2) implementation.  It probably has a
230  * smaller constant and therefore is useful in the small N case, and is also
231  * useful for testing the relatively complex O(N log N) implementation.
232  */
233 
kendallSmallN(float * arr1,float * arr2,int len)234 float kendallSmallN( float *arr1, float *arr2, int len )
235 {
236     int m1 = 0, m2 = 0, s = 0, nPair , i,j ;
237     float cor ;
238 
239 /* printf("enter kendallSmallN\n") ; */
240 
241     for(i = 0; i < len; i++) {
242         for(j = i + 1; j < len; j++) {
243             if(arr2[i] > arr2[j]) {
244                 if (arr1[i] > arr1[j]) {
245                     s++;
246                 } else if(arr1[i] < arr1[j]) {
247                     s--;
248                 } else {
249                     m1++;
250                 }
251             } else if(arr2[i] < arr2[j]) {
252                 if (arr1[i] > arr1[j]) {
253                     s--;
254                 } else if(arr1[i] < arr1[j]) {
255                     s++;
256                 } else {
257                     m1++;
258                 }
259             } else {
260                 m2++;
261 
262                 if(arr1[i] == arr1[j]) {
263                     m1++;
264                 }
265             }
266         }
267     }
268 
269     nPair = len * (len - 1) / 2;
270 
271     if( m1 < nPair && m2 < nPair )
272       cor = s / ( sqrtf((float)(nPair-m1)) * sqrtf((float)(nPair-m2)) ) ;
273     else
274       cor = 0.0f ;
275 
276     return cor ;
277 }
278 
279 #if 0
280 
281 int main( int argc , char *argv[] )
282 {
283     float  a[100], b[100];
284     float  smallNCor, smallNCov, largeNCor, largeNCov;
285     int i;
286     int N ;
287 #define NBOT  10
288 #define NTOP  2000
289 #define NSTEP 10
290 
291     /* Test the small N version against a few values obtained from the old
292      * version in R.  Only exercising the small N version because the large
293      * N version requires the first array to be sorted and the second to be
294      * reordered in lockstep before it's called.*/
295     {
296         float  a1[] = {1,2,3,5,4};
297         float  a2[] = {1,2,3,3,5};
298         float  b1[] = {8,6,7,5,3,0,9};
299         float  b2[] = {3,1,4,1,5,9,2};
300         float  c1[] = {1,1,1,2,3,3,4,4};
301         float  c2[] = {1,2,1,3,3,5,5,5};
302 
303         printf("kSmall 5 = %g\n", kendallSmallN(a1, a2, 5) ) ;
304         printf("kSmall 7 = %g\n", kendallSmallN(b1, b2, 7) ) ;
305         printf("kSmall 8 = %g\n", kendallSmallN(c1, c2, 8) ) ;
306     }
307 
308     /* Now that we're confident that the simple, small N version works,
309      * extensively test it against the much more complex and bug-prone
310      * O(N log N) version.
311      */
312     for(i = 0; i < 100; i++) {
313         int j, len;
314         for(j = 0; j < 100; j++) {
315             a[j] = rand() % 30;
316             b[j] = rand() % 30;
317         }
318 
319         len = rand() % 50 + 50;
320 
321         /* The large N version assumes that the first array is sorted.  This
322          * will usually be made true in R before passing the arrays to the
323          * C functions.
324          */
325         insertionSort(a, len);
326 
327         smallNCor = kendallSmallN(a, b, len);
328         largeNCor = kendallNlogN (a, b, len);
329         printf("Cor: kSmall = %g  kNlogN = %g\n",smallNCor,largeNCor) ;
330     }
331 
332     printf("-- short tests done --\n") ;
333 
334     /* Speed test.  Compare the O(N^2) version, which is very similar to
335      * R's current impl, to my O(N log N) version.
336      */
337     {
338         float  *foo, *bar, *buf;
339         int i;
340         float  startTime, stopTime;
341 
342         foo = (float *) malloc(NTOP * sizeof(float));
343         bar = (float *) malloc(NTOP * sizeof(float));
344         buf = (float *) malloc(NTOP * sizeof(float));
345 
346      for( N=NBOT ; N <= NTOP ; N += NSTEP ){
347         for(i = 0; i < N; i++) {
348             foo[i] = rand();
349             bar[i] = rand();
350         }
351 
352         startTime = clock();
353         smallNCor = kendallSmallN(foo, bar, N);
354         stopTime = clock();
355         printf("N=%d: slow = %f ms  val = %g\n", N,stopTime - startTime,smallNCor);
356 
357         startTime = clock();
358 
359         /* Only sorting first array.  Normally the second one would be
360          * reordered in lockstep.
361          */
362 #if 1
363         qsort_floatfloat( N , foo , bar ) ;
364 #else
365         mergeSort(foo, buf, N);
366 #endif
367         largeNCor = kendallNlogN(foo, bar, N);
368         stopTime = clock();
369         printf("N=%d: fast = %f ms  val = %g\n", N,stopTime - startTime,largeNCor);
370      }
371     }
372 
373     return 0;
374 }
375 
376 #endif
377