1 /*
2  * See copyright in copyright.h and the accompanying file COPYING
3  */
4 
5 /*
6  *========================================================================
7  * This is the Diehard Count a Stream of 1's test, rewritten from the
8  * description in tests.txt on George Marsaglia's diehard site.
9  *
10  *     This is the COUNT-THE-1's TEST on a stream of bytes.      ::
11  * Consider the file under test as a stream of bytes (four per   ::
12  * 32 bit integer).  Each byte can contain from 0 to 8 1's,      ::
13  * with probabilities 1,8,28,56,70,56,28,8,1 over 256.  Now let  ::
14  * the stream of bytes provide a string of overlapping  5-letter ::
15  * words, each "letter" taking values A,B,C,D,E. The letters are ::
16  * determined by the number of 1's in a byte::  0,1,or 2 yield A,::
17  * 3 yields B, 4 yields C, 5 yields D and 6,7 or 8 yield E. Thus ::
18  * we have a monkey at a typewriter hitting five keys with vari- ::
19  * ous probabilities (37,56,70,56,37 over 256).  There are 5^5   ::
20  * possible 5-letter words, and from a string of 256,000 (over-  ::
21  * lapping) 5-letter words, counts are made on the frequencies   ::
22  * for each word.   The quadratic form in the weak inverse of    ::
23  * the covariance matrix of the cell counts provides a chisquare ::
24  * test::  Q5-Q4, the difference of the naive Pearson sums of    ::
25  * (OBS-EXP)^2/EXP on counts for 5- and 4-letter cell counts.    ::
26  *
27  *                       Comment
28  *
29  * For the less statistically trained amongst us, the translation
30  * of the above is:
31  *  Generate a string of base-5 digits derived as described from
32  * random bytes (where we by default generate NON-overlapping bytes
33  * but overlapping ones can be selected by setting the overlap flag
34  * on the command line).
35  *  Turn four and five digit base 5 numbers (created from these digits)
36  * into indices and incrementally count the frequency of occurrence of
37  * each index.
38  *  Compute the expected value of these counts given tsamples samples,
39  * and thereby (from the vector of actual vs expected counts) generate
40  * a chisq for 4 and 5 digit numbers separately.  These chisq's are
41  * expected, for a large number of trials, to be equal to the number of
42  * degrees of freedom of the vectors, (5^5 - 1) or (5^4 - 1).  Generate
43  * the mean DIFFERENCE = 2500 as the expected value of the difference
44  * chisq_5 - chisq_4 and compute a pvalue based on this expected value and
45  * the associated standard deviation sqrt(2*2500).
46  *
47  * This test is written so overlap can be flag-controlled and so that
48  * tsamples can be freely varied compared to diehard's presumed 256,000.
49  * As usual, it also runs at 100 times and not 10 to generate the final
50  * KS pvalue for the test.
51  *
52  * One can easily prove that this test will fail whenever rgb_bitdist
53  * fails at 40 bits and pass when rgb_bitdist passes, as rgb_bitdist
54  * tests the much more stringent requirement that the actual underlying
55  * 40 bit strings are correctly distributed not just with respect to
56  * the morphed number of 1 bits but with respect to the actual underlying
57  * bit patterns.  It is, however, vastly less sensitive -- rgb_bitdist
58  * already FAILS at six bit strings for every generator thus far tested,
59  * meaning that two OCTAL digits WITHOUT any transformations or bit counts
60  * are already incorrectly distributed.  It is then perfectly obvious that
61  * all bitstrings with higher numbers of bits will be incorrectly
62  * distributed as well (including 40 bit strings), but count_the_1s is
63  * distressingly insensitive to this embedded but invisible failure.
64  *========================================================================
65  */
66 
67 
68 #include <dieharder/libdieharder.h>
69 
70 /*
71  * Include inline uint generator
72  */
73 #include "static_get_bits.c"
74 
75 /*
76  * This table was generated using the following code fragment.
77  {
78    char table[256];
79    table[i] = 0;
80    for(j=0;j<8*sizeof(uint);j++){
81      table[i] += get_int_bit(i,j);
82    }
83    switch(table[i]){
84      case 0:
85      case 1:
86      case 2:
87        table[i] = 0;
88        break;
89      case 3:
90        table[i] = 1;
91        break;
92      case 4:
93        table[i] = 2;
94        break;
95      case 5:
96        table[i] = 3;
97        break;
98      case 6:
99      case 7:
100      case 8:
101        table[i] = 4;
102        break;
103      default:
104        fprintf(stderr,"Hahahahah\n");
105        exit(0);
106        break;
107    }
108  }
109  */
110 
111 char b5s[] = {
112 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 2,
113 0, 0, 0, 1, 0, 1, 1, 2, 0, 1, 1, 2, 1, 2, 2, 3,
114 0, 0, 0, 1, 0, 1, 1, 2, 0, 1, 1, 2, 1, 2, 2, 3,
115 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
116 0, 0, 0, 1, 0, 1, 1, 2, 0, 1, 1, 2, 1, 2, 2, 3,
117 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
118 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
119 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 4,
120 0, 0, 0, 1, 0, 1, 1, 2, 0, 1, 1, 2, 1, 2, 2, 3,
121 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
122 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
123 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 4,
124 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
125 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 4,
126 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 4,
127 2, 3, 3, 4, 3, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4};
128 
129 /*
130  * The following are needed to generate the test statistic.
131  * Note that sqrt(5000) = 70.710678118654752440084436210485
132  */
133 const double mu=2500, std=70.7106781;
134 
135 /*
136  * Vector of probabilities for each integer.  (All exact, btw.)
137  * 37.0/256.0,56.0/256.0,70.0/256.0,56.0/256.0,37.0/256.0
138  */
139 const double ps[]={
140 0.144531250,
141 0.218750000,
142 0.273437500,
143 0.218750000,
144 0.144531250};
145 
146 #define LSHIFT5(old,new) (old*5 + new)
147 
diehard_count_1s_stream(Test ** test,int irun)148 int diehard_count_1s_stream(Test **test, int irun)
149 {
150 
151  uint i,j,k,index5=0,index4,letter,t;
152  uint boffset;
153  Vtest vtest4,vtest5;
154  Xtest ptest;
155  uint overlap = 1; /* leftovers/cruft */
156 
157  /*
158   * Count a Stream of 1's is a very complex way of generating a statistic.
159   * We take a random stream, and turn it bytewise (overlapping or not)
160   * into a 4 and/or 5 digit base 5 integer (in the ranges 0-624 and 0-3124
161   * respectively) via the bytewise mapping in b5[] above, derived from the
162   * prescription of Marsaglia in diehard.  Increment a vector for 4 and 5
163   * digit numbers separately that counts the number of times that 4, 5
164   * digit integer has occurred in the random stream.  Compare these
165   * vectors to their expected values, generated from the probabilities of
166   * the occurrence of each base 5 integer in the byte map.  Compute chisq
167   * for each of these vectors -- the "degrees of freedom" of the stream
168   * thus mapped..  The difference between chisq(5) and chisq(4)should be
169   * 5^4 - 5^3 = 2500 with stddev sqrt(5000), use this in an Xtest to
170   * generate the final trial statistic.
171   */
172 
173  /*
174   * I'm leaving this in so the chronically bored can validate that
175   * the table exists and is correctly loaded and addressable.
176   */
177  if(verbose == -1){
178    for(i=0;i<256;i++){
179      printf("%u, ",b5s[i]);
180      /* dumpbits(&i,8); */
181      if((i+1)%16 == 0){
182        printf("\n");
183      }
184    }
185    exit(0);
186  }
187 
188  /*
189   * for display only.  0 means "ignored".
190   */
191  test[0]->ntuple = 0;
192 
193  /*
194   * This is basically a pair of parallel vtests, with a final test
195   * statistic generated from their difference (somehow).  We therefore
196   * create two vtests, one for four digit base 5 integers and one for
197   * five digit base 5 integers, and generate their expected values for
198   * test[0]->tsamples trials.
199   */
200 
201  ptest.y = 2500.0;
202  ptest.sigma = sqrt(5000.0);
203 
204  Vtest_create(&vtest4,625);
205  vtest4.cutoff = 5.0;
206  for(i=0;i<625;i++){
207    j = i;
208    vtest4.y[i] = test[0]->tsamples;
209    vtest4.x[i] = 0.0;
210    /*
211     * Digitize base 5, compute expected value for THIS integer i.
212     */
213    /* printf("%u:  ",j); */
214    for(k=0;k<4;k++){
215      /*
216       * Take the least significant "letter" of j in range 0-4
217       */
218      letter = j%5;
219      /*
220       * multiply by the probability of getting this letter
221       */
222      vtest4.y[i] *= ps[letter];
223      /*
224       * Right shift j to get next digit.
225       */
226      /* printf("%1u",letter); */
227      j /= 5;
228    }
229    /* printf(" = %f\n",vtest4.y[i]); */
230  }
231 
232  Vtest_create(&vtest5,3125);
233  vtest5.cutoff = 5.0;
234  for(i=0;i<3125;i++){
235    j = i;
236    vtest5.y[i] = test[0]->tsamples;
237    vtest5.x[i] = 0.0;
238    /*
239     * Digitize base 5, compute expected value for THIS integer i.
240     */
241    for(k=0;k<5;k++){
242      /*
243       * Take the least significant "letter" of j in range 0-4
244       */
245      letter = j%5;
246      /*
247       * multiply by the probability of getting this letter
248       */
249      vtest5.y[i] *= ps[letter];
250      /*
251       * Right shift j to get next digit.
252       */
253      j /= 5;
254    }
255  }
256 
257  /*
258   * Preload index with the four bytes of the first rand if overlapping
259   * only.
260   */
261  if(overlap){
262    i = get_rand_bits_uint(32, 0xFFFFFFFF, rng);
263    MYDEBUG(D_DIEHARD_COUNT_1S_STREAM){
264      dumpbits(&i,32);
265    }
266    /* 1st byte */
267    /*
268     * Bauer fix.  I don't think he likes my getting ntuples sequentially
269     * from the stream in diehard tests, which is fair enough...;-)
270     * In the raw bit distribution tests, however, it is essential.
271     * j = get_bit_ntuple_from_uint(i,8,0x000000FF,0);
272     */
273    j = get_bit_ntuple_from_whole_uint(i,8,0x000000FF,0);
274    index5 = b5s[j];
275    if(verbose == D_DIEHARD_COUNT_1S_STREAM || verbose == D_ALL){
276      printf("b5s[%u] = %u, index5 = %u\n",j,b5s[j],index5);
277      dumpbits(&j,8);
278    }
279    /* 2nd byte */
280    /* Cruft:  See above.  j = get_bit_ntuple_from_uint(i,8,0x000000FF,8); */
281    j = get_bit_ntuple_from_whole_uint(i,8,0x000000FF,8);
282    index5 = LSHIFT5(index5,b5s[j]);
283    if(verbose == D_DIEHARD_COUNT_1S_STREAM || verbose == D_ALL){
284      printf("b5s[%u] = %u, index5 = %u\n",j,b5s[j],index5);
285      dumpbits(&j,8);
286    }
287    /* 3rd byte */
288    /* Cruft:  See above. j = get_bit_ntuple_from_uint(i,8,0x000000FF,16); */
289    j = get_bit_ntuple_from_whole_uint(i,8,0x000000FF,16);
290    index5 = LSHIFT5(index5,b5s[j]);
291    if(verbose == D_DIEHARD_COUNT_1S_STREAM || verbose == D_ALL){
292      printf("b5s[%u] = %u, index5 = %u\n",j,b5s[j],index5);
293      dumpbits(&j,8);
294    }
295    /* 4th byte */
296    /* Cruft:  See above. j = get_bit_ntuple_from_uint(i,8,0x000000FF,24); */
297    j = get_bit_ntuple_from_whole_uint(i,8,0x000000FF,24);
298    index5 = LSHIFT5(index5,b5s[j]);
299    if(verbose == D_DIEHARD_COUNT_1S_STREAM || verbose == D_ALL){
300      printf("b5s[%u] = %u, index5 = %u\n",j,b5s[j],index5);
301      dumpbits(&j,8);
302    }
303  }
304 
305  boffset = 0;
306  for(t=0;t<test[0]->tsamples;t++){
307 
308    /*
309     * OK, we now have to do a simple modulus decision as to whether or not
310     * a new random uint is required AND to track the byte offset.  We also
311     * have to determine whether or not to use overlapping bytes.  I
312     * actually think that we may need to turn index5 into a subroutine
313     * where each successive call returns the next morphed index5,
314     * overlapping or not.
315     */
316    if(overlap){
317      /*
318       * Use overlapping bytes to generate the next index5 according to
319       * the diehard prescription (designed to work with a very small
320       * input file of rands).
321       */
322      if(boffset%32 == 0){
323        /*
324         * We need a new rand to get our next byte.
325         */
326        boffset = 0;
327        i = get_rand_bits_uint(32, 0xFFFFFFFF, rng);
328        if(verbose == D_DIEHARD_COUNT_1S_STREAM || verbose == D_ALL){
329          dumpbits(&i,32);
330        }
331      }
332      /*
333       * get next byte from the last rand we generated.
334       */
335      /* Cruft:  See above.  j = get_bit_ntuple_from_uint(i,8,0x000000FF,boffset); */
336      j = get_bit_ntuple_from_whole_uint(i,8,0x000000FF,boffset);
337      index5 = LSHIFT5(index5,b5s[j]);
338      /*
339       * I THINK that this basically throws away the sixth digit in the
340       * left-shifted base 5 value, keeping the value of the 5-digit base 5
341       * number in the range 0 to 5^5-1 or 0 to 3124 decimal.
342       */
343      index5 = index5%3125;
344      if(verbose == D_DIEHARD_COUNT_1S_STREAM || verbose == D_ALL){
345        printf("b5s[%u] = %u, index5 = %u\n",j,b5s[j],index5);
346        dumpbits(&j,8);
347      }
348      boffset+=8;
349    } else {
350      /*
351       * Get the next five bytes and make an index5 out of them, no
352       * overlap.
353       */
354      for(k=0;k<5;k++){
355        if(boffset%32 == 0){
356          /*
357 	  * We need a new rand to get our next byte.
358 	  */
359          boffset = 0;
360          i = get_rand_bits_uint(32, 0xFFFFFFFF, rng);
361          if(verbose == D_DIEHARD_COUNT_1S_STREAM || verbose == D_ALL){
362            dumpbits(&i,32);
363          }
364        }
365        /*
366         * get next byte from the last rand we generated.
367         */
368        /* Cruft:  See above. j = get_bit_ntuple_from_uint(i,8,0x000000FF,boffset); */
369        j = get_bit_ntuple_from_whole_uint(i,8,0x000000FF,boffset);
370        index5 = LSHIFT5(index5,b5s[j]);
371        if(verbose == D_DIEHARD_COUNT_1S_STREAM || verbose == D_ALL){
372          printf("b5s[%u] = %u, index5 = %u\n",j,b5s[j],index5);
373          dumpbits(&j,8);
374        }
375        boffset+=8;
376      }
377    }
378    /*
379     * We use modulus to throw away the sixth digit in the left-shifted
380     * base 5 index value, keeping the value of the 5-digit base 5 number
381     * in the range 0 to 5^5-1 or 0 to 3124 decimal.  We repeat for the
382     * four digit index.  At this point we increment the counts for index4
383     * and index5.  Tres simple, no?
384     */
385    index5 = index5%3125;
386    index4 = index5%625;
387    vtest4.x[index4]++;
388    vtest5.x[index5]++;
389  }
390  /*
391   * OK, all that is left now is to figure out the statistic.
392   */
393  if(verbose == D_DIEHARD_COUNT_1S_STREAM || verbose == D_ALL){
394    for(i = 0;i<625;i++){
395      printf("%u:  %f    %f\n",i,vtest4.y[i],vtest4.x[i]);
396    }
397    for(i = 0;i<3125;i++){
398      printf("%u:  %f    %f\n",i,vtest5.y[i],vtest5.x[i]);
399    }
400  }
401  Vtest_eval(&vtest4);
402  Vtest_eval(&vtest5);
403  MYDEBUG(D_DIEHARD_COUNT_1S_STREAM) {
404    printf("vtest4.chisq = %f   vtest5.chisq = %f\n",vtest4.chisq,vtest5.chisq);
405  }
406  ptest.x = vtest5.chisq - vtest4.chisq;
407 
408  Xtest_eval(&ptest);
409  test[0]->pvalues[irun] = ptest.pvalue;
410 
411  MYDEBUG(D_DIEHARD_COUNT_1S_STREAM) {
412    printf("# diehard_count_1s_stream(): test[0]->pvalues[%u] = %10.5f\n",irun,test[0]->pvalues[irun]);
413  }
414 
415  Vtest_destroy(&vtest4);
416  Vtest_destroy(&vtest5);
417 
418  return(0);
419 
420 }
421 
422