1 /*-------------------------------------------------------------------------*/
2 /* This file contains code to calculate Kendall's Tau in O(N log N) time in
3 * a manner similar to the following reference:
4 *
5 * A Computer Method for Calculating Kendall's Tau with Ungrouped Data
6 * William R. Knight Journal of the American Statistical Association,
7 * Vol. 61, No. 314, Part 1 (Jun., 1966), pp. 436-439
8 *
9 * Copyright 2010 David Simcha
10 *
11 * License:
12 * Boost Software License - Version 1.0 - August 17th, 2003
13 *
14 * Permission is hereby granted, free of charge, to any person or organization
15 * obtaining a copy of the software and accompanying documentation covered by
16 * this license (the "Software") to use, reproduce, display, distribute,
17 * execute, and transmit the Software, and to prepare derivative works of the
18 * Software, and to permit third-parties to whom the Software is furnished to
19 * do so, all subject to the following:
20 *
21 * The copyright notices in the Software and this entire statement, including
22 * the above license grant, this restriction and the following disclaimer,
23 * must be included in all copies of the Software, in whole or in part, and
24 * all derivative works of the Software, unless such copies or derivative
25 * works are solely in the form of machine-executable object code generated by
26 * a source language processor.
27 *
28 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
31 * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
32 * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
33 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
34 * DEALINGS IN THE SOFTWARE.
35 *
36 */
37 /*-------------------------------------------------------------------------*/
38 /**---------------- Adapted for AFNI by RWCox - April 2010 ---------------**/
39 /*-------------------------------------------------------------------------*/
40
41 #include <stdlib.h>
42 #include <string.h>
43 #include <math.h>
44 #include <stdio.h>
45 #include <assert.h>
46 #include <time.h>
47
48 #ifdef SOLARIS_OLD /* 'if' might be overkill, but just to be minimal... */
49 /* 10 May 2010 [rickr] */
50 #include "machdep.h"
51 #else
52 #include <stdint.h>
53 #endif
54
55
56 /*-------------------------------------------------------------------------*/
57 /* Sorts in place, returns the bubble sort distance between the input array
58 * and the sorted array.
59 */
60
insertionSort(float * arr,int len)61 static int insertionSort(float *arr, int len)
62 {
63 int maxJ, i,j , swapCount = 0;
64
65 /* printf("enter insertionSort len=%d\n",len) ; */
66
67 if(len < 2) { return 0; }
68
69 maxJ = len - 1;
70 for(i = len - 2; i >= 0; --i) {
71 float val = arr[i];
72 for(j=i; j < maxJ && arr[j + 1] < val; ++j) {
73 arr[j] = arr[j + 1];
74 }
75
76 arr[j] = val;
77 swapCount += (j - i);
78 }
79
80 return swapCount;
81 }
82
83 /*-------------------------------------------------------------------------*/
84
merge(float * from,float * to,int middle,int len)85 static int merge(float *from, float *to, int middle, int len)
86 {
87 int bufIndex, leftLen, rightLen , swaps ;
88 float *left , *right;
89
90 /* printf("enter merge\n") ; */
91
92 bufIndex = 0;
93 swaps = 0;
94
95 left = from;
96 right = from + middle;
97 rightLen = len - middle;
98 leftLen = middle;
99
100 while(leftLen && rightLen) {
101 if(right[0] < left[0]) {
102 to[bufIndex] = right[0];
103 swaps += leftLen;
104 rightLen--;
105 right++;
106 } else {
107 to[bufIndex] = left[0];
108 leftLen--;
109 left++;
110 }
111 bufIndex++;
112 }
113
114 if(leftLen) {
115 #pragma omp critical (MEMCPY)
116 memcpy(to + bufIndex, left, leftLen * sizeof(float));
117 } else if(rightLen) {
118 #pragma omp critical (MEMCPY)
119 memcpy(to + bufIndex, right, rightLen * sizeof(float));
120 }
121
122 return swaps;
123 }
124
125 /*-------------------------------------------------------------------------*/
126 /* Sorts in place, returns the bubble sort distance between the input array
127 * and the sorted array.
128 */
129
mergeSort(float * x,float * buf,int len)130 static int mergeSort(float *x, float *buf, int len)
131 {
132 int swaps , half ;
133
134 /* printf("enter mergeSort\n") ; */
135
136 if(len < 10) {
137 return insertionSort(x, len);
138 }
139
140 swaps = 0;
141
142 if(len < 2) { return 0; }
143
144 half = len / 2;
145 swaps += mergeSort(x, buf, half);
146 swaps += mergeSort(x + half, buf + half, len - half);
147 swaps += merge(x, buf, half, len);
148
149 #pragma omp critical (MEMCPY)
150 memcpy(x, buf, len * sizeof(float));
151 return swaps;
152 }
153
154 /*-------------------------------------------------------------------------*/
155
getMs(float * data,int len)156 static int getMs(float *data, int len) /* Assumes data is sorted */
157 {
158 int Ms = 0, tieCount = 0 , i ;
159
160 /* printf("enter getMs\n") ; */
161
162 for(i = 1; i < len; i++) {
163 if(data[i] == data[i-1]) {
164 tieCount++;
165 } else if(tieCount) {
166 Ms += (tieCount * (tieCount + 1)) / 2;
167 tieCount = 0;
168 }
169 }
170 if(tieCount) {
171 Ms += (tieCount * (tieCount + 1)) / 2;
172 }
173 return Ms;
174 }
175
176 /*-------------------------------------------------------------------------*/
177 /* This function calculates the Kendall correlation tau_b.
178 * The arrays arr1 should be sorted before this call, and arr2 should be
179 * re-ordered in lockstep. This can be done by calling
180 * qsort_floatfloat(len,arr1,arr2)
181 * for example.
182 * Note also that arr1 and arr2 will be modified, so if they need to
183 * be preserved, do so before calling this function.
184 */
185
kendallNlogN(float * arr1,float * arr2,int len)186 float kendallNlogN( float *arr1, float *arr2, int len )
187 {
188 int m1 = 0, m2 = 0, tieCount, swapCount, nPair, s,i ;
189 float cor ;
190
191 /* printf("enter kendallNlogN\n") ; */
192
193 if( len < 2 ) return (float)0 ;
194
195 nPair = len * (len - 1) / 2;
196 s = nPair;
197
198 tieCount = 0;
199 for(i = 1; i < len; i++) {
200 if(arr1[i - 1] == arr1[i]) {
201 tieCount++;
202 } else if(tieCount > 0) {
203 insertionSort(arr2 + i - tieCount - 1, tieCount + 1);
204 m1 += tieCount * (tieCount + 1) / 2;
205 s += getMs(arr2 + i - tieCount - 1, tieCount + 1);
206 tieCount = 0;
207 }
208 }
209 if(tieCount > 0) {
210 insertionSort(arr2 + i - tieCount - 1, tieCount + 1);
211 m1 += tieCount * (tieCount + 1) / 2;
212 s += getMs(arr2 + i - tieCount - 1, tieCount + 1);
213 }
214
215 swapCount = mergeSort(arr2, arr1, len);
216
217 m2 = getMs(arr2, len);
218 s -= (m1 + m2) + 2 * swapCount;
219
220 if( m1 < nPair && m2 < nPair )
221 cor = s / ( sqrtf((float)(nPair-m1)) * sqrtf((float)(nPair-m2)) ) ;
222 else
223 cor = 0.0f ;
224
225 return cor ;
226 }
227
228 /*-------------------------------------------------------------------------*/
229 /* This function uses a simple O(N^2) implementation. It probably has a
230 * smaller constant and therefore is useful in the small N case, and is also
231 * useful for testing the relatively complex O(N log N) implementation.
232 */
233
kendallSmallN(float * arr1,float * arr2,int len)234 float kendallSmallN( float *arr1, float *arr2, int len )
235 {
236 int m1 = 0, m2 = 0, s = 0, nPair , i,j ;
237 float cor ;
238
239 /* printf("enter kendallSmallN\n") ; */
240
241 for(i = 0; i < len; i++) {
242 for(j = i + 1; j < len; j++) {
243 if(arr2[i] > arr2[j]) {
244 if (arr1[i] > arr1[j]) {
245 s++;
246 } else if(arr1[i] < arr1[j]) {
247 s--;
248 } else {
249 m1++;
250 }
251 } else if(arr2[i] < arr2[j]) {
252 if (arr1[i] > arr1[j]) {
253 s--;
254 } else if(arr1[i] < arr1[j]) {
255 s++;
256 } else {
257 m1++;
258 }
259 } else {
260 m2++;
261
262 if(arr1[i] == arr1[j]) {
263 m1++;
264 }
265 }
266 }
267 }
268
269 nPair = len * (len - 1) / 2;
270
271 if( m1 < nPair && m2 < nPair )
272 cor = s / ( sqrtf((float)(nPair-m1)) * sqrtf((float)(nPair-m2)) ) ;
273 else
274 cor = 0.0f ;
275
276 return cor ;
277 }
278
279 #if 0
280
281 int main( int argc , char *argv[] )
282 {
283 float a[100], b[100];
284 float smallNCor, smallNCov, largeNCor, largeNCov;
285 int i;
286 int N ;
287 #define NBOT 10
288 #define NTOP 2000
289 #define NSTEP 10
290
291 /* Test the small N version against a few values obtained from the old
292 * version in R. Only exercising the small N version because the large
293 * N version requires the first array to be sorted and the second to be
294 * reordered in lockstep before it's called.*/
295 {
296 float a1[] = {1,2,3,5,4};
297 float a2[] = {1,2,3,3,5};
298 float b1[] = {8,6,7,5,3,0,9};
299 float b2[] = {3,1,4,1,5,9,2};
300 float c1[] = {1,1,1,2,3,3,4,4};
301 float c2[] = {1,2,1,3,3,5,5,5};
302
303 printf("kSmall 5 = %g\n", kendallSmallN(a1, a2, 5) ) ;
304 printf("kSmall 7 = %g\n", kendallSmallN(b1, b2, 7) ) ;
305 printf("kSmall 8 = %g\n", kendallSmallN(c1, c2, 8) ) ;
306 }
307
308 /* Now that we're confident that the simple, small N version works,
309 * extensively test it against the much more complex and bug-prone
310 * O(N log N) version.
311 */
312 for(i = 0; i < 100; i++) {
313 int j, len;
314 for(j = 0; j < 100; j++) {
315 a[j] = rand() % 30;
316 b[j] = rand() % 30;
317 }
318
319 len = rand() % 50 + 50;
320
321 /* The large N version assumes that the first array is sorted. This
322 * will usually be made true in R before passing the arrays to the
323 * C functions.
324 */
325 insertionSort(a, len);
326
327 smallNCor = kendallSmallN(a, b, len);
328 largeNCor = kendallNlogN (a, b, len);
329 printf("Cor: kSmall = %g kNlogN = %g\n",smallNCor,largeNCor) ;
330 }
331
332 printf("-- short tests done --\n") ;
333
334 /* Speed test. Compare the O(N^2) version, which is very similar to
335 * R's current impl, to my O(N log N) version.
336 */
337 {
338 float *foo, *bar, *buf;
339 int i;
340 float startTime, stopTime;
341
342 foo = (float *) malloc(NTOP * sizeof(float));
343 bar = (float *) malloc(NTOP * sizeof(float));
344 buf = (float *) malloc(NTOP * sizeof(float));
345
346 for( N=NBOT ; N <= NTOP ; N += NSTEP ){
347 for(i = 0; i < N; i++) {
348 foo[i] = rand();
349 bar[i] = rand();
350 }
351
352 startTime = clock();
353 smallNCor = kendallSmallN(foo, bar, N);
354 stopTime = clock();
355 printf("N=%d: slow = %f ms val = %g\n", N,stopTime - startTime,smallNCor);
356
357 startTime = clock();
358
359 /* Only sorting first array. Normally the second one would be
360 * reordered in lockstep.
361 */
362 #if 1
363 qsort_floatfloat( N , foo , bar ) ;
364 #else
365 mergeSort(foo, buf, N);
366 #endif
367 largeNCor = kendallNlogN(foo, bar, N);
368 stopTime = clock();
369 printf("N=%d: fast = %f ms val = %g\n", N,stopTime - startTime,largeNCor);
370 }
371 }
372
373 return 0;
374 }
375
376 #endif
377