1 /*
2 * See copyright in copyright.h and the accompanying file COPYING
3 */
4
5 /*
6 *========================================================================
7 * This is the Diehard Count a Stream of 1's test, rewritten from the
8 * description in tests.txt on George Marsaglia's diehard site.
9 *
10 * This is the COUNT-THE-1's TEST on a stream of bytes. ::
11 * Consider the file under test as a stream of bytes (four per ::
12 * 32 bit integer). Each byte can contain from 0 to 8 1's, ::
13 * with probabilities 1,8,28,56,70,56,28,8,1 over 256. Now let ::
14 * the stream of bytes provide a string of overlapping 5-letter ::
15 * words, each "letter" taking values A,B,C,D,E. The letters are ::
16 * determined by the number of 1's in a byte:: 0,1,or 2 yield A,::
17 * 3 yields B, 4 yields C, 5 yields D and 6,7 or 8 yield E. Thus ::
18 * we have a monkey at a typewriter hitting five keys with vari- ::
19 * ous probabilities (37,56,70,56,37 over 256). There are 5^5 ::
20 * possible 5-letter words, and from a string of 256,000 (over- ::
21 * lapping) 5-letter words, counts are made on the frequencies ::
22 * for each word. The quadratic form in the weak inverse of ::
23 * the covariance matrix of the cell counts provides a chisquare ::
24 * test:: Q5-Q4, the difference of the naive Pearson sums of ::
25 * (OBS-EXP)^2/EXP on counts for 5- and 4-letter cell counts. ::
26 *
27 * Comment
28 *
29 * For the less statistically trained amongst us, the translation
30 * of the above is:
31 * Generate a string of base-5 digits derived as described from
32 * random bytes (where we by default generate NON-overlapping bytes
33 * but overlapping ones can be selected by setting the overlap flag
34 * on the command line).
35 * Turn four and five digit base 5 numbers (created from these digits)
36 * into indices and incrementally count the frequency of occurrence of
37 * each index.
38 * Compute the expected value of these counts given tsamples samples,
39 * and thereby (from the vector of actual vs expected counts) generate
40 * a chisq for 4 and 5 digit numbers separately. These chisq's are
41 * expected, for a large number of trials, to be equal to the number of
42 * degrees of freedom of the vectors, (5^5 - 1) or (5^4 - 1). Generate
43 * the mean DIFFERENCE = 2500 as the expected value of the difference
44 * chisq_5 - chisq_4 and compute a pvalue based on this expected value and
45 * the associated standard deviation sqrt(2*2500).
46 *
47 * This test is written so overlap can be flag-controlled and so that
48 * tsamples can be freely varied compared to diehard's presumed 256,000.
49 * As usual, it also runs at 100 times and not 10 to generate the final
50 * KS pvalue for the test.
51 *
52 * One can easily prove that this test will fail whenever rgb_bitdist
53 * fails at 40 bits and pass when rgb_bitdist passes, as rgb_bitdist
54 * tests the much more stringent requirement that the actual underlying
55 * 40 bit strings are correctly distributed not just with respect to
56 * the morphed number of 1 bits but with respect to the actual underlying
57 * bit patterns. It is, however, vastly less sensitive -- rgb_bitdist
58 * already FAILS at six bit strings for every generator thus far tested,
59 * meaning that two OCTAL digits WITHOUT any transformations or bit counts
60 * are already incorrectly distributed. It is then perfectly obvious that
61 * all bitstrings with higher numbers of bits will be incorrectly
62 * distributed as well (including 40 bit strings), but count_the_1s is
63 * distressingly insensitive to this embedded but invisible failure.
64 *========================================================================
65 */
66
67
68 #include <dieharder/libdieharder.h>
69
70 /*
71 * Include inline uint generator
72 */
73 #include "static_get_bits.c"
74
75 /*
76 * This table was generated using the following code fragment.
77 {
78 char table[256];
79 table[i] = 0;
80 for(j=0;j<8*sizeof(uint);j++){
81 table[i] += get_int_bit(i,j);
82 }
83 switch(table[i]){
84 case 0:
85 case 1:
86 case 2:
87 table[i] = 0;
88 break;
89 case 3:
90 table[i] = 1;
91 break;
92 case 4:
93 table[i] = 2;
94 break;
95 case 5:
96 table[i] = 3;
97 break;
98 case 6:
99 case 7:
100 case 8:
101 table[i] = 4;
102 break;
103 default:
104 fprintf(stderr,"Hahahahah\n");
105 exit(0);
106 break;
107 }
108 }
109 */
110
111 char b5s[] = {
112 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 2,
113 0, 0, 0, 1, 0, 1, 1, 2, 0, 1, 1, 2, 1, 2, 2, 3,
114 0, 0, 0, 1, 0, 1, 1, 2, 0, 1, 1, 2, 1, 2, 2, 3,
115 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
116 0, 0, 0, 1, 0, 1, 1, 2, 0, 1, 1, 2, 1, 2, 2, 3,
117 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
118 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
119 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 4,
120 0, 0, 0, 1, 0, 1, 1, 2, 0, 1, 1, 2, 1, 2, 2, 3,
121 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
122 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
123 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 4,
124 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
125 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 4,
126 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 4,
127 2, 3, 3, 4, 3, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4};
128
129 /*
130 * The following are needed to generate the test statistic.
131 * Note that sqrt(5000) = 70.710678118654752440084436210485
132 */
133 const double mu=2500, std=70.7106781;
134
135 /*
136 * Vector of probabilities for each integer. (All exact, btw.)
137 * 37.0/256.0,56.0/256.0,70.0/256.0,56.0/256.0,37.0/256.0
138 */
139 const double ps[]={
140 0.144531250,
141 0.218750000,
142 0.273437500,
143 0.218750000,
144 0.144531250};
145
146 #define LSHIFT5(old,new) (old*5 + new)
147
diehard_count_1s_stream(Test ** test,int irun)148 int diehard_count_1s_stream(Test **test, int irun)
149 {
150
151 uint i,j,k,index5=0,index4,letter,t;
152 uint boffset;
153 Vtest vtest4,vtest5;
154 Xtest ptest;
155 uint overlap = 1; /* leftovers/cruft */
156
157 /*
158 * Count a Stream of 1's is a very complex way of generating a statistic.
159 * We take a random stream, and turn it bytewise (overlapping or not)
160 * into a 4 and/or 5 digit base 5 integer (in the ranges 0-624 and 0-3124
161 * respectively) via the bytewise mapping in b5[] above, derived from the
162 * prescription of Marsaglia in diehard. Increment a vector for 4 and 5
163 * digit numbers separately that counts the number of times that 4, 5
164 * digit integer has occurred in the random stream. Compare these
165 * vectors to their expected values, generated from the probabilities of
166 * the occurrence of each base 5 integer in the byte map. Compute chisq
167 * for each of these vectors -- the "degrees of freedom" of the stream
168 * thus mapped.. The difference between chisq(5) and chisq(4)should be
169 * 5^4 - 5^3 = 2500 with stddev sqrt(5000), use this in an Xtest to
170 * generate the final trial statistic.
171 */
172
173 /*
174 * I'm leaving this in so the chronically bored can validate that
175 * the table exists and is correctly loaded and addressable.
176 */
177 if(verbose == -1){
178 for(i=0;i<256;i++){
179 printf("%u, ",b5s[i]);
180 /* dumpbits(&i,8); */
181 if((i+1)%16 == 0){
182 printf("\n");
183 }
184 }
185 exit(0);
186 }
187
188 /*
189 * for display only. 0 means "ignored".
190 */
191 test[0]->ntuple = 0;
192
193 /*
194 * This is basically a pair of parallel vtests, with a final test
195 * statistic generated from their difference (somehow). We therefore
196 * create two vtests, one for four digit base 5 integers and one for
197 * five digit base 5 integers, and generate their expected values for
198 * test[0]->tsamples trials.
199 */
200
201 ptest.y = 2500.0;
202 ptest.sigma = sqrt(5000.0);
203
204 Vtest_create(&vtest4,625);
205 vtest4.cutoff = 5.0;
206 for(i=0;i<625;i++){
207 j = i;
208 vtest4.y[i] = test[0]->tsamples;
209 vtest4.x[i] = 0.0;
210 /*
211 * Digitize base 5, compute expected value for THIS integer i.
212 */
213 /* printf("%u: ",j); */
214 for(k=0;k<4;k++){
215 /*
216 * Take the least significant "letter" of j in range 0-4
217 */
218 letter = j%5;
219 /*
220 * multiply by the probability of getting this letter
221 */
222 vtest4.y[i] *= ps[letter];
223 /*
224 * Right shift j to get next digit.
225 */
226 /* printf("%1u",letter); */
227 j /= 5;
228 }
229 /* printf(" = %f\n",vtest4.y[i]); */
230 }
231
232 Vtest_create(&vtest5,3125);
233 vtest5.cutoff = 5.0;
234 for(i=0;i<3125;i++){
235 j = i;
236 vtest5.y[i] = test[0]->tsamples;
237 vtest5.x[i] = 0.0;
238 /*
239 * Digitize base 5, compute expected value for THIS integer i.
240 */
241 for(k=0;k<5;k++){
242 /*
243 * Take the least significant "letter" of j in range 0-4
244 */
245 letter = j%5;
246 /*
247 * multiply by the probability of getting this letter
248 */
249 vtest5.y[i] *= ps[letter];
250 /*
251 * Right shift j to get next digit.
252 */
253 j /= 5;
254 }
255 }
256
257 /*
258 * Preload index with the four bytes of the first rand if overlapping
259 * only.
260 */
261 if(overlap){
262 i = get_rand_bits_uint(32, 0xFFFFFFFF, rng);
263 MYDEBUG(D_DIEHARD_COUNT_1S_STREAM){
264 dumpbits(&i,32);
265 }
266 /* 1st byte */
267 /*
268 * Bauer fix. I don't think he likes my getting ntuples sequentially
269 * from the stream in diehard tests, which is fair enough...;-)
270 * In the raw bit distribution tests, however, it is essential.
271 * j = get_bit_ntuple_from_uint(i,8,0x000000FF,0);
272 */
273 j = get_bit_ntuple_from_whole_uint(i,8,0x000000FF,0);
274 index5 = b5s[j];
275 if(verbose == D_DIEHARD_COUNT_1S_STREAM || verbose == D_ALL){
276 printf("b5s[%u] = %u, index5 = %u\n",j,b5s[j],index5);
277 dumpbits(&j,8);
278 }
279 /* 2nd byte */
280 /* Cruft: See above. j = get_bit_ntuple_from_uint(i,8,0x000000FF,8); */
281 j = get_bit_ntuple_from_whole_uint(i,8,0x000000FF,8);
282 index5 = LSHIFT5(index5,b5s[j]);
283 if(verbose == D_DIEHARD_COUNT_1S_STREAM || verbose == D_ALL){
284 printf("b5s[%u] = %u, index5 = %u\n",j,b5s[j],index5);
285 dumpbits(&j,8);
286 }
287 /* 3rd byte */
288 /* Cruft: See above. j = get_bit_ntuple_from_uint(i,8,0x000000FF,16); */
289 j = get_bit_ntuple_from_whole_uint(i,8,0x000000FF,16);
290 index5 = LSHIFT5(index5,b5s[j]);
291 if(verbose == D_DIEHARD_COUNT_1S_STREAM || verbose == D_ALL){
292 printf("b5s[%u] = %u, index5 = %u\n",j,b5s[j],index5);
293 dumpbits(&j,8);
294 }
295 /* 4th byte */
296 /* Cruft: See above. j = get_bit_ntuple_from_uint(i,8,0x000000FF,24); */
297 j = get_bit_ntuple_from_whole_uint(i,8,0x000000FF,24);
298 index5 = LSHIFT5(index5,b5s[j]);
299 if(verbose == D_DIEHARD_COUNT_1S_STREAM || verbose == D_ALL){
300 printf("b5s[%u] = %u, index5 = %u\n",j,b5s[j],index5);
301 dumpbits(&j,8);
302 }
303 }
304
305 boffset = 0;
306 for(t=0;t<test[0]->tsamples;t++){
307
308 /*
309 * OK, we now have to do a simple modulus decision as to whether or not
310 * a new random uint is required AND to track the byte offset. We also
311 * have to determine whether or not to use overlapping bytes. I
312 * actually think that we may need to turn index5 into a subroutine
313 * where each successive call returns the next morphed index5,
314 * overlapping or not.
315 */
316 if(overlap){
317 /*
318 * Use overlapping bytes to generate the next index5 according to
319 * the diehard prescription (designed to work with a very small
320 * input file of rands).
321 */
322 if(boffset%32 == 0){
323 /*
324 * We need a new rand to get our next byte.
325 */
326 boffset = 0;
327 i = get_rand_bits_uint(32, 0xFFFFFFFF, rng);
328 if(verbose == D_DIEHARD_COUNT_1S_STREAM || verbose == D_ALL){
329 dumpbits(&i,32);
330 }
331 }
332 /*
333 * get next byte from the last rand we generated.
334 */
335 /* Cruft: See above. j = get_bit_ntuple_from_uint(i,8,0x000000FF,boffset); */
336 j = get_bit_ntuple_from_whole_uint(i,8,0x000000FF,boffset);
337 index5 = LSHIFT5(index5,b5s[j]);
338 /*
339 * I THINK that this basically throws away the sixth digit in the
340 * left-shifted base 5 value, keeping the value of the 5-digit base 5
341 * number in the range 0 to 5^5-1 or 0 to 3124 decimal.
342 */
343 index5 = index5%3125;
344 if(verbose == D_DIEHARD_COUNT_1S_STREAM || verbose == D_ALL){
345 printf("b5s[%u] = %u, index5 = %u\n",j,b5s[j],index5);
346 dumpbits(&j,8);
347 }
348 boffset+=8;
349 } else {
350 /*
351 * Get the next five bytes and make an index5 out of them, no
352 * overlap.
353 */
354 for(k=0;k<5;k++){
355 if(boffset%32 == 0){
356 /*
357 * We need a new rand to get our next byte.
358 */
359 boffset = 0;
360 i = get_rand_bits_uint(32, 0xFFFFFFFF, rng);
361 if(verbose == D_DIEHARD_COUNT_1S_STREAM || verbose == D_ALL){
362 dumpbits(&i,32);
363 }
364 }
365 /*
366 * get next byte from the last rand we generated.
367 */
368 /* Cruft: See above. j = get_bit_ntuple_from_uint(i,8,0x000000FF,boffset); */
369 j = get_bit_ntuple_from_whole_uint(i,8,0x000000FF,boffset);
370 index5 = LSHIFT5(index5,b5s[j]);
371 if(verbose == D_DIEHARD_COUNT_1S_STREAM || verbose == D_ALL){
372 printf("b5s[%u] = %u, index5 = %u\n",j,b5s[j],index5);
373 dumpbits(&j,8);
374 }
375 boffset+=8;
376 }
377 }
378 /*
379 * We use modulus to throw away the sixth digit in the left-shifted
380 * base 5 index value, keeping the value of the 5-digit base 5 number
381 * in the range 0 to 5^5-1 or 0 to 3124 decimal. We repeat for the
382 * four digit index. At this point we increment the counts for index4
383 * and index5. Tres simple, no?
384 */
385 index5 = index5%3125;
386 index4 = index5%625;
387 vtest4.x[index4]++;
388 vtest5.x[index5]++;
389 }
390 /*
391 * OK, all that is left now is to figure out the statistic.
392 */
393 if(verbose == D_DIEHARD_COUNT_1S_STREAM || verbose == D_ALL){
394 for(i = 0;i<625;i++){
395 printf("%u: %f %f\n",i,vtest4.y[i],vtest4.x[i]);
396 }
397 for(i = 0;i<3125;i++){
398 printf("%u: %f %f\n",i,vtest5.y[i],vtest5.x[i]);
399 }
400 }
401 Vtest_eval(&vtest4);
402 Vtest_eval(&vtest5);
403 MYDEBUG(D_DIEHARD_COUNT_1S_STREAM) {
404 printf("vtest4.chisq = %f vtest5.chisq = %f\n",vtest4.chisq,vtest5.chisq);
405 }
406 ptest.x = vtest5.chisq - vtest4.chisq;
407
408 Xtest_eval(&ptest);
409 test[0]->pvalues[irun] = ptest.pvalue;
410
411 MYDEBUG(D_DIEHARD_COUNT_1S_STREAM) {
412 printf("# diehard_count_1s_stream(): test[0]->pvalues[%u] = %10.5f\n",irun,test[0]->pvalues[irun]);
413 }
414
415 Vtest_destroy(&vtest4);
416 Vtest_destroy(&vtest5);
417
418 return(0);
419
420 }
421
422