1 /*
2  * ========================================================================
3  * See copyright in copyright.h and the accompanying file COPYING
4  * ========================================================================
5  */
6 
7 /*
8  *========================================================================
9  * This is the Diehard Count a Stream of 1's test, rewritten from the
10  * description in tests.txt on George Marsaglia's diehard site.
11  *
12  *     This is the COUNT-THE-1's TEST for specific bytes.        ::
13  * Consider the file under test as a stream of 32-bit integers.  ::
14  * From each integer, a specific byte is chosen , say the left-  ::
15  * most::  bits 1 to 8. Each byte can contain from 0 to 8 1's,   ::
16  * with probabilitie 1,8,28,56,70,56,28,8,1 over 256.  Now let   ::
17  * the specified bytes from successive integers provide a string ::
18  * of (overlapping) 5-letter words, each "letter" taking values  ::
19  * A,B,C,D,E. The letters are determined  by the number of 1's,  ::
20  * in that byte::  0,1,or 2 ---> A, 3 ---> B, 4 ---> C, 5 ---> D,::
21  * and  6,7 or 8 ---> E.  Thus we have a monkey at a typewriter  ::
22  * hitting five keys with with various probabilities::  37,56,70,::
23  * 56,37 over 256. There are 5^5 possible 5-letter words, and    ::
24  * from a string of 256,000 (overlapping) 5-letter words, counts ::
25  * are made on the frequencies for each word. The quadratic form ::
26  * in the weak inverse of the covariance matrix of the cell      ::
27  * counts provides a chisquare test::  Q5-Q4, the difference of  ::
28  * the naive Pearson  sums of (OBS-EXP)^2/EXP on counts for 5-   ::
29  * and 4-letter cell counts.                                     ::
30  *
31  *                       Comment
32  *
33  * For the less statistically trained amongst us, the translation
34  * of the above is:
35  *  Generate a string of base-5 digits derived as described from
36  * specific (randomly chosen) byte offsets in a random integer.
37  *  Turn four and five digit base 5 numbers (created from these digits)
38  * into indices and incrementally count the frequency of occurrence of
39  * each index.
40  *  Compute the expected value of these counts given tsamples samples,
41  * and thereby (from the vector of actual vs expected counts) generate
42  * a chisq for 4 and 5 digit numbers separately.  These chisq's are
43  * expected, for a large number of trials, to be equal to the number of
44  * degrees of freedom of the vectors, (5^5 - 1) or (5^4 - 1).  Generate
45  * the mean DIFFERENCE = 2500 as the expected value of the difference
46  * chisq_5 - chisq_4 and compute a pvalue based on this expected value and
47  * the associated standard deviation sqrt(2*2500).
48  *
49  * Note:  The byte offset is systematically cycled PER tsample, so enough
50  * tsamples gives one a reasonable chance of discovering "bad" offsets
51  * in any exist.  How many is enough?  Difficult to say a priori -- it
52  * depends on how bit the failure is and how many offsets create a
53  * failure.  Play around with it.
54  *
55  * Note also that the code itself is a bit simpler than the stream
56  * version (no need to worry about e.g. overlap or modulus 4/5 when
57  * getting the next int/byte) but that it generates 4x as many rands
58  * as non-overlapping stream: tsamples*psamples*5, basically.
59  *
60  * This test is actually LESS stringent than the stream version overall,
61  * and is much closer to rgb_bitdist.  In fact, if rgb_bitdist for 8 bits
62  * fails the diehard_count_1s_byte test MUST fail as it too samples all
63  * the different offsets systematically.  However, as before, MOST failures
64  * at 8 bits are derived from failures at smaller numbers of bits (this
65  * is an assertion that can be made precise in terms of contributing
66  * permutations) and again, the test is vastly less sensitive than
67  * rgb_bitdist and is less senstive that diehard_count_1s_stream EXCEPT
68  * that it samples more rands and might reveal problems with specific
69  * offsets ignored by the stream test.  Honestly, I could fix the stream
70  * test to cycle through the possible bitlevel offsets and make this
71  * test completely obsolete.
72  *========================================================================
73  */
74 
75 
76 #include <dieharder/libdieharder.h>
77 
78 /*
79  * Include inline uint generator
80  */
81 #include "static_get_bits.c"
82 
83 /*
84  * This table was generated using the following code fragment.
85  {
86    char table[256];
87    table[i] = 0;
88    for(j=0;j<8*sizeof(uint);j++){
89      table[i] += get_int_bit(i,j);
90    }
91    switch(table[i]){
92      case 0:
93      case 1:
94      case 2:
95        table[i] = 0;
96        break;
97      case 3:
98        table[i] = 1;
99        break;
100      case 4:
101        table[i] = 2;
102        break;
103      case 5:
104        table[i] = 3;
105        break;
106      case 6:
107      case 7:
108      case 8:
109        table[i] = 4;
110        break;
111      default:
112        fprintf(stderr,"Hahahahah\n");
113        exit(0);
114        break;
115    }
116  }
117  */
118 
119 char b5b[] = {
120 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 2,
121 0, 0, 0, 1, 0, 1, 1, 2, 0, 1, 1, 2, 1, 2, 2, 3,
122 0, 0, 0, 1, 0, 1, 1, 2, 0, 1, 1, 2, 1, 2, 2, 3,
123 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
124 0, 0, 0, 1, 0, 1, 1, 2, 0, 1, 1, 2, 1, 2, 2, 3,
125 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
126 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
127 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 4,
128 0, 0, 0, 1, 0, 1, 1, 2, 0, 1, 1, 2, 1, 2, 2, 3,
129 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
130 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
131 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 4,
132 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
133 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 4,
134 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 4,
135 2, 3, 3, 4, 3, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4};
136 
137 /*
138  * The following is needed to generate the test statistic
139  *
140  * Vector of probabilities for each integer.
141  * 37.0/256.0,56.0/256.0,70.0/256.0,56.0/256.0,37.0/256.0
142  */
143 const double pb[]={
144 0.144531250,
145 0.218750000,
146 0.273437500,
147 0.218750000,
148 0.144531250};
149 
150 
151 
152 /*
153  * Useful macro for building 4 and 5 digit base 5 numbers from
154  * base 5 digits
155  */
156 #define LSHIFT5(old,new) (old*5 + new)
157 
diehard_count_1s_byte(Test ** test,int irun)158 int diehard_count_1s_byte(Test **test, int irun)
159 {
160 
161  uint i,j,k,index5=0,index4,letter,t;
162  uint boffset;
163  Vtest vtest4,vtest5;
164  Xtest ptest;
165 
166  /*
167   * count_1s in specific bytes is straightforward after looking over
168   * count_1s in a byte.  The statistic is identical; we just have to
169   * cycle the offset of the bytes selected and generate 1 random uint
170   * per digit.
171   */
172 
173  /*
174   * I'm leaving this in so the chronically bored can validate that
175   * the table exists and is correctly loaded and addressable.
176   */
177  if(verbose == -1){
178    for(i=0;i<256;i++){
179      printf("%u, ",b5b[i]);
180      /* dumpbits(&i,8); */
181      if((i+1)%16 == 0){
182        printf("\n");
183      }
184    }
185    exit(0);
186  }
187 
188  /*
189   * for display only.  0 means "ignored".
190   */
191  test[0]->ntuple = 0;
192 
193  /*
194   * This is basically a pair of parallel vtests, with a final test
195   * statistic generated from their difference (somehow).  We therefore
196   * create two vtests, one for four digit base 5 integers and one for
197   * five digit base 5 integers, and generate their expected values for
198   * test[0]->tsamples trials.
199   */
200 
201  ptest.y = 2500.0;
202  ptest.sigma = sqrt(5000.0);
203 
204  Vtest_create(&vtest4,625);
205  vtest4.cutoff = 5.0;
206  for(i=0;i<625;i++){
207    j = i;
208    vtest4.y[i] = test[0]->tsamples;
209    vtest4.x[i] = 0.0;
210    /*
211     * Digitize base 5, compute expected value for THIS integer i.
212     */
213    /* printf("%u:  ",j); */
214    for(k=0;k<4;k++){
215      /*
216       * Take the least significant "letter" of j in range 0-4
217       */
218      letter = j%5;
219      /*
220       * multiply by the probability of getting this letter
221       */
222      vtest4.y[i] *= pb[letter];
223      /*
224       * Right shift j to get next digit.
225       */
226      /* printf("%1u",letter); */
227      j /= 5;
228    }
229    /* printf(" = %f\n",vtest4.y[i]); */
230  }
231 
232  Vtest_create(&vtest5,3125);
233  vtest5.cutoff = 5.0;
234  for(i=0;i<3125;i++){
235    j = i;
236    vtest5.y[i] = test[0]->tsamples;
237    vtest5.x[i] = 0.0;
238    /*
239     * Digitize base 5, compute expected value for THIS integer i.
240     */
241    for(k=0;k<5;k++){
242      /*
243       * Take the least significant "letter" of j in range 0-4
244       */
245      letter = j%5;
246      /*
247       * multiply by the probability of getting this letter
248       */
249      vtest5.y[i] *= pb[letter];
250      /*
251       * Right shift j to get next digit.
252       */
253      j /= 5;
254    }
255  }
256 
257  /*
258   * Here is the test.  We cycle boffset through test[0]->tsamples
259   */
260  boffset = 0;
261  for(t=0;t<test[0]->tsamples;t++){
262 
263    boffset = t%32;  /* Remember that get_bit_ntuple periodic wraps the uint */
264    /*
265     * Get the next five bytes and make an index5 out of them, no
266     * overlap.
267     */
268    for(k=0;k<5;k++){
269      i = get_rand_bits_uint(32, 0xFFFFFFFF, rng);
270      if(verbose == D_DIEHARD_COUNT_1S_STREAM || verbose == D_ALL){
271        dumpbits(&i,32);
272      }
273      /*
274       * get next byte from the last rand we generated.
275       * Bauer fix -
276       *   Cruft: j = get_bit_ntuple_from_uint(i,8,0x000000FF,boffset);
277       */
278      j = get_bit_ntuple_from_whole_uint(i,8,0x000000FF,boffset);
279      index5 = LSHIFT5(index5,b5b[j]);
280      if(verbose == D_DIEHARD_COUNT_1S_STREAM || verbose == D_ALL){
281        printf("b5b[%u] = %u, index5 = %u\n",j,b5b[j],index5);
282        dumpbits(&j,8);
283      }
284    }
285    /*
286     * We use modulus to throw away the sixth digit in the left-shifted
287     * base 5 index value, keeping the value of the 5-digit base 5 number
288     * in the range 0 to 5^5-1 or 0 to 3124 decimal.  We repeat for the
289     * four digit index.  At this point we increment the counts for index4
290     * and index5.  Tres simple, no?
291     */
292    index5 = index5%3125;
293    index4 = index5%625;
294    vtest4.x[index4]++;
295    vtest5.x[index5]++;
296  }
297  /*
298   * OK, all that is left now is to figure out the statistic.
299   */
300  if(verbose == D_DIEHARD_COUNT_1S_BYTE || verbose == D_ALL){
301    for(i = 0;i<625;i++){
302      printf("%u:  %f    %f\n",i,vtest4.y[i],vtest4.x[i]);
303    }
304    for(i = 0;i<3125;i++){
305      printf("%u:  %f    %f\n",i,vtest5.y[i],vtest5.x[i]);
306    }
307  }
308  Vtest_eval(&vtest4);
309  Vtest_eval(&vtest5);
310  if(verbose == D_DIEHARD_COUNT_1S_BYTE || verbose == D_ALL){
311    printf("vtest4.chisq = %f   vtest5.chisq = %f\n",vtest4.chisq,vtest5.chisq);
312  }
313  ptest.x = vtest5.chisq - vtest4.chisq;
314 
315  Xtest_eval(&ptest);
316  test[0]->pvalues[irun] = ptest.pvalue;
317 
318  MYDEBUG(D_DIEHARD_COUNT_1S_BYTE) {
319    printf("# diehard_count_1s_byte(): test[0]->pvalues[%u] = %10.5f\n",irun,test[0]->pvalues[irun]);
320  }
321 
322 
323  Vtest_destroy(&vtest4);
324  Vtest_destroy(&vtest5);
325 
326  return(0);
327 }
328 
329