1 /*
2 * ========================================================================
3 * $Id: rgb_operm.c 252 2006-10-10 13:17:36Z rgb $
4 *
5 * See copyright in copyright.h and the accompanying file COPYING
6 * ========================================================================
7 */
8
9 /*
10 * ========================================================================
11 * This is the revised Overlapping Permutations test. It directly
12 * simulates the covariance matrix of overlapping permutations. The way
13 * this works below (tentatively) is:
14 *
15 * For a bit ntuple of length N, slide a window of length N to the
16 * right one bit at a time. Compute the permutation index of the
17 * original ntuple, the permutation index of the window ntuple, and
18 * accumulate the covariance matrix of the two positions. This
19 * can be directly and precisely computed as well. The simulated
20 * result should be distributed according to the chisq distribution,
21 * so we subtract the two and feed it into the chisq program as a
22 * vector to compute p.
23 *
24 * This MAY NOT BE RIGHT. I'm working from both Marsaglia's limited
25 * documentation (in a program that doesn't do ANYTHING like what the
26 * documentation says it does) and from Nilpotent Markov Processes.
27 * But I confess to not quite understand how to actually perform the
28 * test in the latter -- it is very good at describing the construction
29 * of the target matrix, not so good at describing how to transform
30 * this into a chisq and p.
31 *
32 * FWIW, as I get something that actually works here, I'm going to
33 * THOROUGHLY document it in the book that will accompany the test.
34 *========================================================================
35 */
36
37 #include <dieharder/libdieharder.h>
38 #define RGB_OPERM_KMAX 10
39
40 /*
41 * Some globals that will eventually go in the test include where they
42 * arguably belong.
43 */
44 double fpipi(int pi1,int pi2,int nkp);
45 uint piperm(size_t *data,int len);
46 void make_cexact();
47 void make_cexpt();
48 int nperms,noperms;
49 double **cexact,**ceinv,**cexpt,**idty;
50 double *cvexact,*cvein,*cvexpt,*vidty;
51
rgb_operm(Test ** test,int irun)52 int rgb_operm(Test **test,int irun)
53 {
54
55 int i,j,n,nb,iv,s;
56 uint csamples; /* rgb_operm_k^2 is vector size of cov matrix */
57 uint *count,ctotal; /* counters */
58 uint size;
59 double pvalue,ntuple_prob,pbin; /* probabilities */
60 Vtest *vtest; /* Chisq entry vector */
61
62 gsl_matrix_view CEXACT,CEINV,CEXPT,IDTY;
63
64 /*
65 * For a given n = ntuple size in bits, there are n! bit orderings
66 */
67 MYDEBUG(D_RGB_OPERM){
68 printf("#==================================================================\n");
69 printf("# rgb_operm: Running rgb_operm verbosely for k = %d.\n",rgb_operm_k);
70 printf("# rgb_operm: Use -v = %d to focus.\n",D_RGB_OPERM);
71 printf("# rgb_operm: ======================================================\n");
72 }
73
74 /*
75 * Sanity check first
76 */
77 if((rgb_operm_k < 0) || (rgb_operm_k > RGB_OPERM_KMAX)){
78 printf("\nError: rgb_operm_k must be a positive integer <= %u. Exiting.\n",RGB_OPERM_KMAX);
79 exit(0);
80 }
81
82 nperms = gsl_sf_fact(rgb_operm_k);
83 noperms = gsl_sf_fact(3*rgb_operm_k-2);
84 csamples = rgb_operm_k*rgb_operm_k;
85 gsl_permutation * p = gsl_permutation_alloc(nperms);
86
87 /*
88 * Allocate memory for value_max vector of Vtest structs and counts,
89 * PER TEST. Note that we must free both of these when we are done
90 * or leak.
91 */
92 vtest = (Vtest *)malloc(csamples*sizeof(Vtest));
93 count = (uint *)malloc(csamples*sizeof(uint));
94 Vtest_create(vtest,csamples+1);
95
96 /*
97 * We have to allocate and free the cexact and cexpt matrices here
98 * or they'll be forgotten when these routines return.
99 */
100 MYDEBUG(D_RGB_OPERM){
101 printf("# rgb_operm: Creating and zeroing cexact[][] and cexpt[][].\n");
102 }
103 cexact = (double **)malloc(nperms*sizeof(double*));
104 ceinv = (double **)malloc(nperms*sizeof(double*));
105 cexpt = (double **)malloc(nperms*sizeof(double*));
106 idty = (double **)malloc(nperms*sizeof(double*));
107 cvexact = (double *)malloc(nperms*nperms*sizeof(double));
108 cvein = (double *)malloc(nperms*nperms*sizeof(double));
109 cvexpt = (double *)malloc(nperms*nperms*sizeof(double));
110 vidty = (double *)malloc(nperms*nperms*sizeof(double));
111 for(i=0;i<nperms;i++){
112 /* Here we pack addresses to map the matrix addressing onto the vector */
113 cexact[i] = &cvexact[i*nperms];
114 ceinv[i] = &cvein[i*nperms];
115 cexpt[i] = &cvexpt[i*nperms];
116 idty[i] = &vidty[i*nperms];
117 for(j = 0;j<nperms;j++){
118 cexact[i][j] = 0.0;
119 ceinv[i][j] = 0.0;
120 cexpt[i][j] = 0.0;
121 idty[i][j] = 0.0;
122 }
123 }
124
125 make_cexact();
126 make_cexpt();
127
128 iv=0;
129 for(i=0;i<nperms;i++){
130 for(j=0;j<nperms;j++){
131 cvexact[iv] = cexact[i][j];
132 cvexpt[iv] = cexpt[i][j];
133 vidty[iv] = 0.0;
134 }
135 }
136
137 CEXACT = gsl_matrix_view_array(cvexact, nperms, nperms);
138 CEINV = gsl_matrix_view_array(cvein , nperms, nperms);
139 CEXPT = gsl_matrix_view_array(cvexpt , nperms, nperms);
140 IDTY = gsl_matrix_view_array(vidty , nperms, nperms);
141
142 /*
143 * Hmmm, looks like cexact isn't invertible. Duh. So it has eigenvalues.
144 * This seems to be important (how, I do not know) so let's find out.
145 * Here is the gsl ritual for evaluating eigenvalues etc.
146 */
147
148 gsl_vector *eval = gsl_vector_alloc (nperms);
149 gsl_matrix *evec = gsl_matrix_alloc (nperms,nperms);
150 /*
151 gsl_eigen_nonsymm_workspace* w = gsl_eigen_nonsymmv_alloc(nperms);
152 gsl_eigen_nonsymm_params (1,0,w);
153 gsl_eigen_nonsymmv(&CEXACT.matrix, eval, evec, w);
154 gsl_eigen_nonsymmv_free (w);
155 */
156 gsl_eigen_symmv_workspace* w = gsl_eigen_symmv_alloc(nperms);
157 gsl_eigen_symmv(&CEXACT.matrix, eval, evec, w);
158 gsl_eigen_symmv_free (w);
159 gsl_eigen_symmv_sort (eval, evec, GSL_EIGEN_SORT_ABS_ASC);
160
161 {
162 int i;
163
164 printf("#==================================================================\n");
165 for (i = 0; i < nperms; i++) {
166 double eval_i = gsl_vector_get (eval, i);
167 gsl_vector_view evec_i = gsl_matrix_column (evec, i);
168 printf ("eigenvalue[%u] = %g\n", i, eval_i);
169 printf ("eigenvector[%u] = \n",i);
170 gsl_vector_fprintf (stdout,&evec_i.vector, "%10.5f");
171 }
172 printf("#==================================================================\n");
173 }
174
175 gsl_vector_free (eval);
176 gsl_matrix_free (evec);
177
178 /*
179 gsl_linalg_LU_decomp(&CEXACT.matrix, p, &s);
180 gsl_linalg_LU_invert(&CEXACT, p, &CEINV);
181 gsl_permutation_free(p);
182 gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &CEINV.matrix, &CEXPT.matrix, 0.0, &IDTY.matrix);
183 printf("#==================================================================\n");
184 printf("# Should be inverse of C, assuming it is invertible:\n");
185 for(i=0;i<nperms;i++){
186 printf("# ");
187 for(j = 0;j<nperms;j++){
188 printf("%8.3f ",idty[i][j]);
189 }
190 printf("\n");
191 }
192 printf("#==================================================================\n");
193 printf("#==================================================================\n");
194 printf("# Should be normal on identity:\n");
195 for(i=0;i<nperms;i++){
196 printf("# ");
197 for(j = 0;j<nperms;j++){
198 printf("%8.3f ",idty[i][j]);
199 }
200 printf("\n");
201 }
202 printf("#==================================================================\n");
203 */
204
205
206
207 /*
208 * OK, at this point we have two matrices: cexact[][] is filled with
209 * the exact covariance matrix expected for the overlapping permutations.
210 * cexpt[][] has been filled numerically by generating strings of random
211 * uints or floats, generating sort index permutations, and
212 * using them to IDENTICALLY generate an "experimental" version of c[][].
213 * The two should correspond, in the limit of large tsamples. IF I
214 * understand Alhakim, Kawczak and Molchanov, then the way to implement
215 * the simplest possible chisq test is to evaluate:
216 * cexact^-1 cexpt \approx I
217 * where the diagonal terms should form a vector that is chisq distributed?
218 * Let's try this...
219 */
220
221
222
223 /*
224 * Free cexact[][] and cexpt[][]
225 * Fix this when we're done so we don't leak; for now to much trouble.
226 for(i=0;i<nperms;i++){
227 free(cexact[i]);
228 free(cexpt[i]);
229 }
230 free(cexact);
231 free(cexpt);
232 */
233
234 free(count);
235 return(0);
236
237 }
238
make_cexact()239 void make_cexact()
240 {
241
242 int i,j,k,ip,t,nop;
243 double fi,fj;
244 /*
245 * This is the test vector.
246 */
247 double testv[RGB_OPERM_KMAX*2]; /* easier than malloc etc, but beware length */
248 /*
249 * pi[] is the permutation index of a sample. ps[] holds the
250 * actual sample.
251 */
252 size_t pi[4096],ps[4096];
253 /*
254 * We seem to have made a mistake of sorts. We actually have to sum
255 * BOTH the forward AND the backward directions. That means that the
256 * permutation vector has to be of length 3k-1, with the pi=1 term
257 * corresponding to the middle. So for k=2, instead of 0,1,2 we need
258 * 0 1 2 3 4 and we'll have to do 23, 34 in the leading direction and
259 * 21, 10 in the trailing direction.
260 */
261 gsl_permutation **operms;
262
263 MYDEBUG(D_RGB_OPERM){
264 printf("#==================================================================\n");
265 printf("# rgb_operm: Running cexact()\n");
266 }
267
268 /*
269 * Test fpipi(). This is probably cruft, actually.
270 MYDEBUG(D_RGB_OPERM){
271 printf("# rgb_operm: Testing fpipi()\n");
272 for(i=0;i<nperms;i++){
273 for(j = 0;j<nperms;j++){
274 printf("# rgb_operm: fpipi(%u,%u,%u) = %f\n",i,j,nperms,fpipi(i,j,nperms));
275 }
276 }
277 }
278 */
279
280 MYDEBUG(D_RGB_OPERM){
281 printf("#==================================================================\n");
282 printf("# rgb_operm: Forming set of %u overlapping permutations\n",noperms);
283 printf("# rgb_operm: Permutations\n");
284 printf("# rgb_operm:==============================\n");
285 }
286 operms = (gsl_permutation**) malloc(noperms*sizeof(gsl_permutation*));
287 for(i=0;i<noperms;i++){
288 operms[i] = gsl_permutation_alloc(3*rgb_operm_k - 2);
289 /* Must quiet down
290 MYDEBUG(D_RGB_OPERM){
291 printf("# rgb_operm: ");
292 }
293 */
294 if(i == 0){
295 gsl_permutation_init(operms[i]);
296 } else {
297 gsl_permutation_memcpy(operms[i],operms[i-1]);
298 gsl_permutation_next(operms[i]);
299 }
300 /*
301 MYDEBUG(D_RGB_OPERM){
302 gsl_permutation_fprintf(stdout,operms[i]," %u");
303 printf("\n");
304 }
305 */
306 }
307
308 /*
309 * We now form c_exact PRECISELY the same way that we do c_expt[][]
310 * below, except that instead of pulling random samples of integers
311 * or floats and averaging over the permutations thus represented,
312 * we iterate over the complete set of equally weighted permutations
313 * to get an exact answer. Note that we have to center on 2k-1 and
314 * go both forwards and backwards.
315 */
316 for(t=0;t<noperms;t++){
317 /*
318 * To sort into a perm, test vector needs to be double.
319 */
320 for(k=0;k<3*rgb_operm_k - 2;k++) testv[k] = (double) operms[t]->data[k];
321
322 /* Not cruft, but quiet...
323 MYDEBUG(D_RGB_OPERM){
324 printf("#------------------------------------------------------------------\n");
325 printf("# Generating offset sample permutation pi's\n");
326 }
327 */
328 for(k=0;k<2*rgb_operm_k - 1;k++){
329 gsl_sort_index((size_t *) ps,&testv[k],1,rgb_operm_k);
330 pi[k] = piperm((size_t *) ps,rgb_operm_k);
331
332 /* Not cruft, but quiet...
333 MYDEBUG(D_RGB_OPERM){
334 printf("# %u: ",k);
335 for(ip=k;ip<rgb_operm_k+k;ip++){
336 printf("%.1f ",testv[ip]);
337 }
338 printf("\n# ");
339 for(ip=0;ip<rgb_operm_k;ip++){
340 printf("%u ",ps[ip]);
341 }
342 printf(" = %u\n",pi[k]);
343 }
344 */
345
346 }
347
348 /*
349 * This is the business end of things. The covariance matrix is the
350 * the sum of a central function of the permutation indices that yields
351 * nperms-1/nperms on diagonal, -1/nperms off diagonal, for all the
352 * possible permutations, for the FIRST permutation in a sample (fi)
353 * times the sum of the same function over all the overlapping permutations
354 * drawn from the same sample. Quite simple, really.
355 */
356 for(i=0;i<nperms;i++){
357 fi = fpipi(i,pi[rgb_operm_k-1],nperms);
358 for(j=0;j<nperms;j++){
359 fj = 0.0;
360 for(k=0;k<rgb_operm_k;k++){
361 fj += fpipi(j,pi[rgb_operm_k - 1 + k],nperms);
362 if(k != 0){
363 fj += fpipi(j,pi[rgb_operm_k - 1 - k],nperms);
364 }
365 }
366 cexact[i][j] += fi*fj;
367 }
368 }
369
370 }
371
372 MYDEBUG(D_RGB_OPERM){
373 printf("# rgb_operm:==============================\n");
374 printf("# rgb_operm: cexact[][] = \n");
375 }
376 for(i=0;i<nperms;i++){
377 MYDEBUG(D_RGB_OPERM){
378 printf("# ");
379 }
380 for(j=0;j<nperms;j++){
381 cexact[i][j] /= noperms;
382 MYDEBUG(D_RGB_OPERM){
383 printf("%10.6f ",cexact[i][j]);
384 }
385 }
386 MYDEBUG(D_RGB_OPERM){
387 printf("\n");
388 }
389 }
390
391 /*
392 * Free operms[]
393 */
394 for(i=0;i<noperms;i++){
395 gsl_permutation_free(operms[i]);
396 }
397 free(operms);
398
399 }
400
make_cexpt()401 void make_cexpt()
402 {
403
404 int i,j,k,ip,t;
405 double fi,fj;
406 /*
407 * This is the test vector.
408 */
409 double testv[RGB_OPERM_KMAX*2]; /* easier than malloc etc, but beware length */
410 /*
411 * pi[] is the permutation index of a sample. ps[] holds the
412 * actual sample.
413 */
414 int pi[4096],ps[4096];
415
416 MYDEBUG(D_RGB_OPERM){
417 printf("#==================================================================\n");
418 printf("# rgb_operm: Running cexpt()\n");
419 }
420
421 /*
422 * We evaluate cexpt[][] by sampling. In a nutshell, this involves
423 * a) Filling testv[] with 2*rgb_operm_k - 1 random uints or doubles
424 * It clearly cannot matter which we use, as long as the probability of
425 * exact duplicates in a sample is very low.
426 * b) Using gsl_sort_index the exact same way it was used in make_cexact()
427 * to generate the pi[] index, using ps[] as scratch space for the sort
428 * indices.
429 * c) Evaluating fi and fj from the SAMPLED result, tsamples times.
430 * d) Normalizing.
431 * Note that this is pretty much identical to the way we formed c_exact[][]
432 * except that we are determining the relative frequency of each sort order
433 * permutation 2*rgb_operm_k-1 long.
434 *
435 * NOTE WELL! I honestly think that it is borderline silly to view
436 * this as a matrix and to go through all of this nonsense. The theoretical
437 * c_exact[][] is computed from the observation that all the permutations
438 * of n objects have equal weight = 1/n!. Consequently, they should
439 * individually be binomially distributed, tending to normal with many
440 * samples. Collectively they should be distributed like a vector of
441 * equal binomial probabilities and a p-value should follow either from
442 * chisq on n!-1 DoF or for that matter a KS test. I see no way that
443 * making it into a matrix can increase the sensitivity of the test -- if
444 * the p-values are well defined in the two cases they can only be equal
445 * by their very definition.
446 *
447 * If you are a statistician reading these words and disagree, please
448 * communicate with me and explain why I'm wrong. I'm still very much
449 * learning statistics and would cherish gentle correction.
450 */
451 for(t=0;t<tsamples;t++){
452 /*
453 * To sort into a perm, test vector needs to be double.
454 */
455 for(k=0;k<3*rgb_operm_k - 2;k++) testv[k] = (double) gsl_rng_get(rng);
456
457 /* Not cruft, but quiet...
458 MYDEBUG(D_RGB_OPERM){
459 printf("#------------------------------------------------------------------\n");
460 printf("# Generating offset sample permutation pi's\n");
461 }
462 */
463 for(k=0;k<2*rgb_operm_k-1;k++){
464 gsl_sort_index(ps,&testv[k],1,rgb_operm_k);
465 pi[k] = piperm(ps,rgb_operm_k);
466
467 /* Not cruft, but quiet...
468 MYDEBUG(D_RGB_OPERM){
469 printf("# %u: ",k);
470 for(ip=k;ip<rgb_operm_k+k;ip++){
471 printf("%.1f ",testv[ip]);
472 }
473 printf("\n# ");
474 for(ip=0;ip<rgb_operm_k;ip++){
475 printf("%u ",permsample->data[ip]);
476 }
477 printf(" = %u\n",pi[k]);
478 }
479 */
480 }
481
482 /*
483 * This is the business end of things. The covariance matrix is the
484 * the sum of a central function of the permutation indices that yields
485 * nperms-1/nperms on diagonal, -1/nperms off diagonal, for all the
486 * possible permutations, for the FIRST permutation in a sample (fi)
487 * times the sum of the same function over all the overlapping permutations
488 * drawn from the same sample. Quite simple, really.
489 */
490 for(i=0;i<nperms;i++){
491 fi = fpipi(i,pi[rgb_operm_k-1],nperms);
492 for(j=0;j<nperms;j++){
493 fj = 0.0;
494 for(k=0;k<rgb_operm_k;k++){
495 fj += fpipi(j,pi[rgb_operm_k - 1 + k],nperms);
496 if(k != 0){
497 fj += fpipi(j,pi[rgb_operm_k - 1 - k],nperms);
498 }
499 }
500 cexpt[i][j] += fi*fj;
501 }
502 }
503
504 }
505
506 MYDEBUG(D_RGB_OPERM){
507 printf("# rgb_operm:==============================\n");
508 printf("# rgb_operm: cexpt[][] = \n");
509 }
510 for(i=0;i<nperms;i++){
511 MYDEBUG(D_RGB_OPERM){
512 printf("# ");
513 }
514 for(j=0;j<nperms;j++){
515 cexpt[i][j] /= tsamples;
516 MYDEBUG(D_RGB_OPERM){
517 printf("%10.6f ",cexpt[i][j]);
518 }
519 }
520 MYDEBUG(D_RGB_OPERM){
521 printf("\n");
522 }
523 }
524
525 }
526
piperm(size_t * data,int len)527 uint piperm(size_t *data,int len)
528 {
529
530 uint i,j,k,max,min;
531 uint pindex,uret,tmp;
532 static gsl_permutation** lookup = 0;
533
534 /*
535 * Allocate space for lookup table and fill it.
536 */
537 if(lookup == 0){
538 lookup = (gsl_permutation**) malloc(nperms*sizeof(gsl_permutation*));
539 MYDEBUG(D_RGB_OPERM){
540 printf("# rgb_operm: Allocating piperm lookup table of perms.\n");
541 }
542 for(i=0;i<nperms;i++){
543 lookup[i] = gsl_permutation_alloc(rgb_operm_k);
544 }
545 for(i=0;i<nperms;i++){
546 if(i == 0){
547 gsl_permutation_init(lookup[i]);
548 } else {
549 gsl_permutation_memcpy(lookup[i],lookup[i-1]);
550 gsl_permutation_next(lookup[i]);
551 }
552 }
553
554 /*
555 * This method yields a mirror symmetry in the permutations top to
556 * bottom.
557 for(i=0;i<nperms/2;i++){
558 if(i == 0){
559 gsl_permutation_init(lookup[i]);
560 for(j=0;j<rgb_operm_k;j++){
561 lookup[nperms-i-1]->data[rgb_operm_k-j-1] = lookup[i]->data[j];
562 }
563 } else {
564 gsl_permutation_memcpy(lookup[i],lookup[i-1]);
565 gsl_permutation_next(lookup[i]);
566 for(j=0;j<rgb_operm_k;j++){
567 lookup[nperms-i-1]->data[rgb_operm_k-j-1] = lookup[i]->data[j];
568 }
569 }
570 }
571 */
572 MYDEBUG(D_RGB_OPERM){
573 for(i=0;i<nperms;i++){
574 printf("# rgb_operm: %u => ",i);
575 gsl_permutation_fprintf(stdout,lookup[i]," %u");
576 printf("\n");
577 }
578 }
579
580 }
581
582 for(i=0;i<nperms;i++){
583 if(memcmp(data,lookup[i]->data,len*sizeof(uint))==0){
584 /* Not cruft, but off:
585 MYDEBUG(D_RGB_OPERM){
586 printf("# piperm(): ");
587 gsl_permutation_fprintf(stdout,lookup[i]," %u");
588 printf(" = %u\n",i);
589 }
590 */
591 return(i);
592 }
593 }
594 printf("We'd better not get here...\n");
595
596 return(0);
597
598 }
599
fpipi(int pi1,int pi2,int nkp)600 double fpipi(int pi1,int pi2,int nkp)
601 {
602
603 int i;
604 double fret;
605
606 /*
607 * compute the k-permutation index from iperm for the window
608 * at data[offset] of length len. If it matches pind, return
609 * the first quantity, otherwise return the second.
610 */
611 if(pi1 == pi2){
612
613 fret = (double) (nkp - 1.0)/nkp;
614 if(verbose < 0){
615 printf(" f(%d,%d) = %10.6f\n",pi1,pi2,fret);
616 }
617 return(fret);
618
619 } else {
620
621 fret = (double) (-1.0/nkp);
622 if(verbose < 0){
623 printf(" f(%d,%d) = %10.6f\n",pi1,pi2,fret);
624 }
625 return(fret);
626
627 }
628
629
630 }
631
632
633
634
635