1 /*
2  * ========================================================================
3  * $Id: rgb_operm.c 252 2006-10-10 13:17:36Z rgb $
4  *
5  * See copyright in copyright.h and the accompanying file COPYING
6  * ========================================================================
7  */
8 
9 /*
10  * ========================================================================
11  * This is the revised Overlapping Permutations test.  It directly
12  * simulates the covariance matrix of overlapping permutations.  The way
13  * this works below (tentatively) is:
14  *
15  *    For a bit ntuple of length N, slide a window of length N to the
16  *    right one bit at a time.  Compute the permutation index of the
17  *    original ntuple, the permutation index of the window ntuple, and
18  *    accumulate the covariance matrix of the two positions.  This
19  *    can be directly and precisely computed as well.  The simulated
20  *    result should be distributed according to the chisq distribution,
21  *    so we subtract the two and feed it into the chisq program as a
22  *    vector to compute p.
23  *
24  * This MAY NOT BE RIGHT.  I'm working from both Marsaglia's limited
25  * documentation (in a program that doesn't do ANYTHING like what the
26  * documentation says it does) and from Nilpotent Markov Processes.
27  * But I confess to not quite understand how to actually perform the
28  * test in the latter -- it is very good at describing the construction
29  * of the target matrix, not so good at describing how to transform
30  * this into a chisq and p.
31  *
32  * FWIW, as I get something that actually works here, I'm going to
33  * THOROUGHLY document it in the book that will accompany the test.
34  *========================================================================
35  */
36 
37 #include <dieharder/libdieharder.h>
38 #define RGB_OPERM_KMAX 10
39 
40 /*
41  * Some globals that will eventually go in the test include where they
42  * arguably belong.
43  */
44 double fpipi(int pi1,int pi2,int nkp);
45 uint piperm(size_t *data,int len);
46 void make_cexact();
47 void make_cexpt();
48 int nperms,noperms;
49 double **cexact,**ceinv,**cexpt,**idty;
50 double *cvexact,*cvein,*cvexpt,*vidty;
51 
rgb_operm(Test ** test,int irun)52 int rgb_operm(Test **test,int irun)
53 {
54 
55  int i,j,n,nb,iv,s;
56  uint csamples;   /* rgb_operm_k^2 is vector size of cov matrix */
57  uint *count,ctotal; /* counters */
58  uint size;
59  double pvalue,ntuple_prob,pbin;  /* probabilities */
60  Vtest *vtest;   /* Chisq entry vector */
61 
62  gsl_matrix_view CEXACT,CEINV,CEXPT,IDTY;
63 
64  /*
65   * For a given n = ntuple size in bits, there are n! bit orderings
66   */
67  MYDEBUG(D_RGB_OPERM){
68    printf("#==================================================================\n");
69    printf("# rgb_operm: Running rgb_operm verbosely for k = %d.\n",rgb_operm_k);
70    printf("# rgb_operm: Use -v = %d to focus.\n",D_RGB_OPERM);
71    printf("# rgb_operm: ======================================================\n");
72  }
73 
74  /*
75   * Sanity check first
76   */
77  if((rgb_operm_k < 0) || (rgb_operm_k > RGB_OPERM_KMAX)){
78    printf("\nError:  rgb_operm_k must be a positive integer <= %u.  Exiting.\n",RGB_OPERM_KMAX);
79    exit(0);
80  }
81 
82  nperms = gsl_sf_fact(rgb_operm_k);
83  noperms = gsl_sf_fact(3*rgb_operm_k-2);
84  csamples = rgb_operm_k*rgb_operm_k;
85  gsl_permutation * p = gsl_permutation_alloc(nperms);
86 
87  /*
88   * Allocate memory for value_max vector of Vtest structs and counts,
89   * PER TEST.  Note that we must free both of these when we are done
90   * or leak.
91   */
92  vtest = (Vtest *)malloc(csamples*sizeof(Vtest));
93  count = (uint *)malloc(csamples*sizeof(uint));
94  Vtest_create(vtest,csamples+1);
95 
96  /*
97   * We have to allocate and free the cexact and cexpt matrices here
98   * or they'll be forgotten when these routines return.
99   */
100  MYDEBUG(D_RGB_OPERM){
101    printf("# rgb_operm: Creating and zeroing cexact[][] and cexpt[][].\n");
102  }
103  cexact = (double **)malloc(nperms*sizeof(double*));
104  ceinv  = (double **)malloc(nperms*sizeof(double*));
105  cexpt  = (double **)malloc(nperms*sizeof(double*));
106  idty   = (double **)malloc(nperms*sizeof(double*));
107  cvexact = (double *)malloc(nperms*nperms*sizeof(double));
108  cvein   = (double *)malloc(nperms*nperms*sizeof(double));
109  cvexpt  = (double *)malloc(nperms*nperms*sizeof(double));
110  vidty   = (double *)malloc(nperms*nperms*sizeof(double));
111  for(i=0;i<nperms;i++){
112    /* Here we pack addresses to map the matrix addressing onto the vector */
113    cexact[i] = &cvexact[i*nperms];
114    ceinv[i] = &cvein[i*nperms];
115    cexpt[i] = &cvexpt[i*nperms];
116    idty[i] = &vidty[i*nperms];
117    for(j = 0;j<nperms;j++){
118      cexact[i][j] = 0.0;
119      ceinv[i][j] = 0.0;
120      cexpt[i][j]  = 0.0;
121      idty[i][j]   = 0.0;
122    }
123  }
124 
125  make_cexact();
126  make_cexpt();
127 
128  iv=0;
129  for(i=0;i<nperms;i++){
130    for(j=0;j<nperms;j++){
131      cvexact[iv] = cexact[i][j];
132      cvexpt[iv]  = cexpt[i][j];
133      vidty[iv]   = 0.0;
134    }
135  }
136 
137  CEXACT = gsl_matrix_view_array(cvexact, nperms, nperms);
138  CEINV  = gsl_matrix_view_array(cvein  , nperms, nperms);
139  CEXPT  = gsl_matrix_view_array(cvexpt , nperms, nperms);
140  IDTY   = gsl_matrix_view_array(vidty  , nperms, nperms);
141 
142  /*
143   * Hmmm, looks like cexact isn't invertible.  Duh.  So it has eigenvalues.
144   * This seems to be important (how, I do not know) so let's find out.
145   * Here is the gsl ritual for evaluating eigenvalues etc.
146   */
147 
148  gsl_vector *eval = gsl_vector_alloc (nperms);
149  gsl_matrix *evec = gsl_matrix_alloc (nperms,nperms);
150  /*
151  gsl_eigen_nonsymm_workspace* w =  gsl_eigen_nonsymmv_alloc(nperms);
152  gsl_eigen_nonsymm_params (1,0,w);
153  gsl_eigen_nonsymmv(&CEXACT.matrix, eval, evec, w);
154  gsl_eigen_nonsymmv_free (w);
155  */
156  gsl_eigen_symmv_workspace* w =  gsl_eigen_symmv_alloc(nperms);
157  gsl_eigen_symmv(&CEXACT.matrix, eval, evec, w);
158  gsl_eigen_symmv_free (w);
159  gsl_eigen_symmv_sort (eval, evec, GSL_EIGEN_SORT_ABS_ASC);
160 
161  {
162    int i;
163 
164    printf("#==================================================================\n");
165    for (i = 0; i < nperms; i++) {
166      double eval_i = gsl_vector_get (eval, i);
167      gsl_vector_view evec_i = gsl_matrix_column (evec, i);
168      printf ("eigenvalue[%u] = %g\n", i, eval_i);
169      printf ("eigenvector[%u] = \n",i);
170      gsl_vector_fprintf (stdout,&evec_i.vector, "%10.5f");
171    }
172    printf("#==================================================================\n");
173  }
174 
175  gsl_vector_free (eval);
176  gsl_matrix_free (evec);
177 
178 /*
179  gsl_linalg_LU_decomp(&CEXACT.matrix, p, &s);
180  gsl_linalg_LU_invert(&CEXACT, p, &CEINV);
181  gsl_permutation_free(p);
182  gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, &CEINV.matrix, &CEXPT.matrix, 0.0, &IDTY.matrix);
183  printf("#==================================================================\n");
184  printf("# Should be inverse of C, assuming it is invertible:\n");
185  for(i=0;i<nperms;i++){
186    printf("# ");
187    for(j = 0;j<nperms;j++){
188      printf("%8.3f ",idty[i][j]);
189    }
190    printf("\n");
191  }
192  printf("#==================================================================\n");
193  printf("#==================================================================\n");
194  printf("# Should be normal on identity:\n");
195  for(i=0;i<nperms;i++){
196    printf("# ");
197    for(j = 0;j<nperms;j++){
198      printf("%8.3f ",idty[i][j]);
199    }
200    printf("\n");
201  }
202  printf("#==================================================================\n");
203  */
204 
205 
206 
207  /*
208   * OK, at this point we have two matrices:  cexact[][] is filled with
209   * the exact covariance matrix expected for the overlapping permutations.
210   * cexpt[][] has been filled numerically by generating strings of random
211   * uints or floats, generating sort index permutations, and
212   * using them to IDENTICALLY generate an "experimental" version of c[][].
213   * The two should correspond, in the limit of large tsamples.  IF I
214   * understand Alhakim, Kawczak and Molchanov, then the way to implement
215   * the simplest possible chisq test is to evaluate:
216   *       cexact^-1 cexpt \approx I
217   * where the diagonal terms should form a vector that is chisq distributed?
218   * Let's try this...
219   */
220 
221 
222 
223  /*
224   * Free cexact[][] and cexpt[][]
225   * Fix this when we're done so we don't leak; for now to much trouble.
226  for(i=0;i<nperms;i++){
227    free(cexact[i]);
228    free(cexpt[i]);
229  }
230  free(cexact);
231  free(cexpt);
232   */
233 
234  free(count);
235  return(0);
236 
237 }
238 
make_cexact()239 void make_cexact()
240 {
241 
242  int i,j,k,ip,t,nop;
243  double fi,fj;
244  /*
245   * This is the test vector.
246   */
247  double testv[RGB_OPERM_KMAX*2];  /* easier than malloc etc, but beware length */
248  /*
249   * pi[] is the permutation index of a sample.  ps[] holds the
250   * actual sample.
251   */
252  size_t pi[4096],ps[4096];
253  /*
254   * We seem to have made a mistake of sorts.  We actually have to sum
255   * BOTH the forward AND the backward directions.  That means that the
256   * permutation vector has to be of length 3k-1, with the pi=1 term
257   * corresponding to the middle.  So for k=2, instead of 0,1,2 we need
258   * 0 1 2 3 4 and we'll have to do 23, 34 in the leading direction and
259   * 21, 10 in the trailing direction.
260   */
261  gsl_permutation **operms;
262 
263  MYDEBUG(D_RGB_OPERM){
264    printf("#==================================================================\n");
265    printf("# rgb_operm: Running cexact()\n");
266  }
267 
268  /*
269   * Test fpipi().  This is probably cruft, actually.
270  MYDEBUG(D_RGB_OPERM){
271    printf("# rgb_operm: Testing fpipi()\n");
272    for(i=0;i<nperms;i++){
273      for(j = 0;j<nperms;j++){
274        printf("# rgb_operm: fpipi(%u,%u,%u) = %f\n",i,j,nperms,fpipi(i,j,nperms));
275      }
276    }
277  }
278  */
279 
280  MYDEBUG(D_RGB_OPERM){
281    printf("#==================================================================\n");
282    printf("# rgb_operm: Forming set of %u overlapping permutations\n",noperms);
283    printf("# rgb_operm: Permutations\n");
284    printf("# rgb_operm:==============================\n");
285  }
286  operms = (gsl_permutation**) malloc(noperms*sizeof(gsl_permutation*));
287  for(i=0;i<noperms;i++){
288    operms[i] = gsl_permutation_alloc(3*rgb_operm_k - 2);
289    /* Must quiet down
290    MYDEBUG(D_RGB_OPERM){
291      printf("# rgb_operm: ");
292    }
293    */
294    if(i == 0){
295      gsl_permutation_init(operms[i]);
296    } else {
297      gsl_permutation_memcpy(operms[i],operms[i-1]);
298      gsl_permutation_next(operms[i]);
299    }
300    /*
301    MYDEBUG(D_RGB_OPERM){
302      gsl_permutation_fprintf(stdout,operms[i]," %u");
303      printf("\n");
304    }
305    */
306  }
307 
308  /*
309   * We now form c_exact PRECISELY the same way that we do c_expt[][]
310   * below, except that instead of pulling random samples of integers
311   * or floats and averaging over the permutations thus represented,
312   * we iterate over the complete set of equally weighted permutations
313   * to get an exact answer.  Note that we have to center on 2k-1 and
314   * go both forwards and backwards.
315   */
316  for(t=0;t<noperms;t++){
317    /*
318     * To sort into a perm, test vector needs to be double.
319     */
320    for(k=0;k<3*rgb_operm_k - 2;k++) testv[k] = (double) operms[t]->data[k];
321 
322    /* Not cruft, but quiet...
323    MYDEBUG(D_RGB_OPERM){
324      printf("#------------------------------------------------------------------\n");
325      printf("# Generating offset sample permutation pi's\n");
326    }
327    */
328    for(k=0;k<2*rgb_operm_k - 1;k++){
329      gsl_sort_index((size_t *) ps,&testv[k],1,rgb_operm_k);
330      pi[k] = piperm((size_t *) ps,rgb_operm_k);
331 
332      /* Not cruft, but quiet...
333      MYDEBUG(D_RGB_OPERM){
334        printf("# %u: ",k);
335        for(ip=k;ip<rgb_operm_k+k;ip++){
336          printf("%.1f ",testv[ip]);
337        }
338        printf("\n# ");
339        for(ip=0;ip<rgb_operm_k;ip++){
340          printf("%u ",ps[ip]);
341        }
342        printf(" = %u\n",pi[k]);
343      }
344      */
345 
346    }
347 
348    /*
349     * This is the business end of things.  The covariance matrix is the
350     * the sum of a central function of the permutation indices that yields
351     * nperms-1/nperms on diagonal, -1/nperms off diagonal, for all the
352     * possible permutations, for the FIRST permutation in a sample (fi)
353     * times the sum of the same function over all the overlapping permutations
354     * drawn from the same sample.  Quite simple, really.
355     */
356    for(i=0;i<nperms;i++){
357      fi = fpipi(i,pi[rgb_operm_k-1],nperms);
358      for(j=0;j<nperms;j++){
359        fj = 0.0;
360        for(k=0;k<rgb_operm_k;k++){
361          fj += fpipi(j,pi[rgb_operm_k - 1 + k],nperms);
362          if(k != 0){
363            fj += fpipi(j,pi[rgb_operm_k - 1 - k],nperms);
364 	 }
365        }
366        cexact[i][j] += fi*fj;
367      }
368    }
369 
370  }
371 
372  MYDEBUG(D_RGB_OPERM){
373    printf("# rgb_operm:==============================\n");
374    printf("# rgb_operm: cexact[][] = \n");
375  }
376  for(i=0;i<nperms;i++){
377    MYDEBUG(D_RGB_OPERM){
378      printf("# ");
379    }
380    for(j=0;j<nperms;j++){
381      cexact[i][j] /= noperms;
382      MYDEBUG(D_RGB_OPERM){
383        printf("%10.6f  ",cexact[i][j]);
384      }
385    }
386    MYDEBUG(D_RGB_OPERM){
387      printf("\n");
388    }
389  }
390 
391  /*
392   * Free operms[]
393   */
394  for(i=0;i<noperms;i++){
395    gsl_permutation_free(operms[i]);
396  }
397  free(operms);
398 
399 }
400 
make_cexpt()401 void make_cexpt()
402 {
403 
404  int i,j,k,ip,t;
405  double fi,fj;
406  /*
407   * This is the test vector.
408   */
409  double testv[RGB_OPERM_KMAX*2];  /* easier than malloc etc, but beware length */
410  /*
411   * pi[] is the permutation index of a sample.  ps[] holds the
412   * actual sample.
413   */
414  int pi[4096],ps[4096];
415 
416  MYDEBUG(D_RGB_OPERM){
417    printf("#==================================================================\n");
418    printf("# rgb_operm: Running cexpt()\n");
419  }
420 
421  /*
422   * We evaluate cexpt[][] by sampling.  In a nutshell, this involves
423   *   a) Filling testv[] with 2*rgb_operm_k - 1 random uints or doubles
424   * It clearly cannot matter which we use, as long as the probability of
425   * exact duplicates in a sample is very low.
426   *   b) Using gsl_sort_index the exact same way it was used in make_cexact()
427   * to generate the pi[] index, using ps[] as scratch space for the sort
428   * indices.
429   *   c) Evaluating fi and fj from the SAMPLED result, tsamples times.
430   *   d) Normalizing.
431   * Note that this is pretty much identical to the way we formed c_exact[][]
432   * except that we are determining the relative frequency of each sort order
433   * permutation 2*rgb_operm_k-1 long.
434   *
435   * NOTE WELL!  I honestly think that it is borderline silly to view
436   * this as a matrix and to go through all of this nonsense.  The theoretical
437   * c_exact[][] is computed from the observation that all the permutations
438   * of n objects have equal weight = 1/n!.  Consequently, they should
439   * individually be binomially distributed, tending to normal with many
440   * samples.  Collectively they should be distributed like a vector of
441   * equal binomial probabilities and a p-value should follow either from
442   * chisq on n!-1 DoF or for that matter a KS test.  I see no way that
443   * making it into a matrix can increase the sensitivity of the test -- if
444   * the p-values are well defined in the two cases they can only be equal
445   * by their very definition.
446   *
447   * If you are a statistician reading these words and disagree, please
448   * communicate with me and explain why I'm wrong.  I'm still very much
449   * learning statistics and would cherish gentle correction.
450   */
451  for(t=0;t<tsamples;t++){
452    /*
453     * To sort into a perm, test vector needs to be double.
454     */
455    for(k=0;k<3*rgb_operm_k - 2;k++) testv[k] = (double) gsl_rng_get(rng);
456 
457    /* Not cruft, but quiet...
458    MYDEBUG(D_RGB_OPERM){
459      printf("#------------------------------------------------------------------\n");
460      printf("# Generating offset sample permutation pi's\n");
461    }
462    */
463    for(k=0;k<2*rgb_operm_k-1;k++){
464      gsl_sort_index(ps,&testv[k],1,rgb_operm_k);
465      pi[k] = piperm(ps,rgb_operm_k);
466 
467      /* Not cruft, but quiet...
468      MYDEBUG(D_RGB_OPERM){
469        printf("# %u: ",k);
470        for(ip=k;ip<rgb_operm_k+k;ip++){
471          printf("%.1f ",testv[ip]);
472        }
473        printf("\n# ");
474        for(ip=0;ip<rgb_operm_k;ip++){
475          printf("%u ",permsample->data[ip]);
476        }
477        printf(" = %u\n",pi[k]);
478      }
479      */
480    }
481 
482    /*
483     * This is the business end of things.  The covariance matrix is the
484     * the sum of a central function of the permutation indices that yields
485     * nperms-1/nperms on diagonal, -1/nperms off diagonal, for all the
486     * possible permutations, for the FIRST permutation in a sample (fi)
487     * times the sum of the same function over all the overlapping permutations
488     * drawn from the same sample.  Quite simple, really.
489     */
490    for(i=0;i<nperms;i++){
491      fi = fpipi(i,pi[rgb_operm_k-1],nperms);
492      for(j=0;j<nperms;j++){
493        fj = 0.0;
494        for(k=0;k<rgb_operm_k;k++){
495          fj += fpipi(j,pi[rgb_operm_k - 1 + k],nperms);
496 	 if(k != 0){
497            fj += fpipi(j,pi[rgb_operm_k - 1 - k],nperms);
498 	 }
499        }
500        cexpt[i][j] += fi*fj;
501      }
502    }
503 
504  }
505 
506  MYDEBUG(D_RGB_OPERM){
507    printf("# rgb_operm:==============================\n");
508    printf("# rgb_operm: cexpt[][] = \n");
509  }
510  for(i=0;i<nperms;i++){
511    MYDEBUG(D_RGB_OPERM){
512      printf("# ");
513    }
514    for(j=0;j<nperms;j++){
515      cexpt[i][j] /= tsamples;
516      MYDEBUG(D_RGB_OPERM){
517        printf("%10.6f  ",cexpt[i][j]);
518      }
519    }
520    MYDEBUG(D_RGB_OPERM){
521      printf("\n");
522    }
523  }
524 
525 }
526 
piperm(size_t * data,int len)527 uint piperm(size_t *data,int len)
528 {
529 
530  uint i,j,k,max,min;
531  uint pindex,uret,tmp;
532  static gsl_permutation** lookup = 0;
533 
534  /*
535   * Allocate space for lookup table and fill it.
536   */
537  if(lookup == 0){
538    lookup = (gsl_permutation**) malloc(nperms*sizeof(gsl_permutation*));
539    MYDEBUG(D_RGB_OPERM){
540      printf("# rgb_operm: Allocating piperm lookup table of perms.\n");
541    }
542    for(i=0;i<nperms;i++){
543         lookup[i] = gsl_permutation_alloc(rgb_operm_k);
544    }
545    for(i=0;i<nperms;i++){
546      if(i == 0){
547        gsl_permutation_init(lookup[i]);
548      } else {
549        gsl_permutation_memcpy(lookup[i],lookup[i-1]);
550        gsl_permutation_next(lookup[i]);
551      }
552    }
553 
554    /*
555     * This method yields a mirror symmetry in the permutations top to
556     * bottom.
557    for(i=0;i<nperms/2;i++){
558      if(i == 0){
559        gsl_permutation_init(lookup[i]);
560        for(j=0;j<rgb_operm_k;j++){
561          lookup[nperms-i-1]->data[rgb_operm_k-j-1] = lookup[i]->data[j];
562        }
563      } else {
564        gsl_permutation_memcpy(lookup[i],lookup[i-1]);
565        gsl_permutation_next(lookup[i]);
566        for(j=0;j<rgb_operm_k;j++){
567          lookup[nperms-i-1]->data[rgb_operm_k-j-1] = lookup[i]->data[j];
568        }
569      }
570    }
571    */
572    MYDEBUG(D_RGB_OPERM){
573      for(i=0;i<nperms;i++){
574        printf("# rgb_operm: %u => ",i);
575        gsl_permutation_fprintf(stdout,lookup[i]," %u");
576        printf("\n");
577      }
578    }
579 
580  }
581 
582  for(i=0;i<nperms;i++){
583    if(memcmp(data,lookup[i]->data,len*sizeof(uint))==0){
584      /* Not cruft, but off:
585      MYDEBUG(D_RGB_OPERM){
586        printf("# piperm(): ");
587        gsl_permutation_fprintf(stdout,lookup[i]," %u");
588        printf(" = %u\n",i);
589      }
590      */
591      return(i);
592    }
593  }
594  printf("We'd better not get here...\n");
595 
596  return(0);
597 
598 }
599 
fpipi(int pi1,int pi2,int nkp)600 double fpipi(int pi1,int pi2,int nkp)
601 {
602 
603  int i;
604  double fret;
605 
606  /*
607   * compute the k-permutation index from iperm for the window
608   * at data[offset] of length len.  If it matches pind, return
609   * the first quantity, otherwise return the second.
610   */
611  if(pi1 == pi2){
612 
613    fret = (double) (nkp - 1.0)/nkp;
614    if(verbose < 0){
615      printf(" f(%d,%d) = %10.6f\n",pi1,pi2,fret);
616    }
617    return(fret);
618 
619  } else {
620 
621    fret = (double) (-1.0/nkp);
622    if(verbose < 0){
623      printf(" f(%d,%d) = %10.6f\n",pi1,pi2,fret);
624    }
625    return(fret);
626 
627  }
628 
629 
630 }
631 
632 
633 
634 
635