1 /******************************************************************************
2 ** Filename: cluster.cpp
3 ** Purpose: Routines for clustering points in N-D space
4 ** Author: Dan Johnson
5 **
6 ** (c) Copyright Hewlett-Packard Company, 1988.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *****************************************************************************/
17
18 #define _USE_MATH_DEFINES // for M_PI
19
20 #include "cluster.h"
21
22 #include "genericheap.h"
23 #include "kdpair.h"
24 #include "matrix.h"
25 #include "tprintf.h"
26
27 #include "helpers.h"
28
29 #include <cfloat> // for FLT_MAX
30 #include <cmath> // for M_PI
31 #include <array> // for std::array
32 #include <vector> // for std::vector
33
34 namespace tesseract {
35
#define HOTELLING 1 // If true use Hotelling's test to decide where to split.
#define FTABLE_X 10 // Size of FTable.
#define FTABLE_Y 100 // Size of FTable.

// Table of values approximating the cumulative F-distribution for a confidence
// of 1%. One row per line, FTABLE_X entries each.
// NOTE(review): layout appears to be the standard F-table form —
// FTable[d2 - 1][d1 - 1] for numerator df d1 and denominator df d2 — but that
// indexing convention should be confirmed against the callers.
const double FTable[FTABLE_Y][FTABLE_X] = {
    {4052.19, 4999.52, 5403.34, 5624.62, 5763.65, 5858.97, 5928.33, 5981.10, 6022.50, 6055.85},
    {98.502, 99.000, 99.166, 99.249, 99.300, 99.333, 99.356, 99.374, 99.388, 99.399},
    {34.116, 30.816, 29.457, 28.710, 28.237, 27.911, 27.672, 27.489, 27.345, 27.229},
    {21.198, 18.000, 16.694, 15.977, 15.522, 15.207, 14.976, 14.799, 14.659, 14.546},
    {16.258, 13.274, 12.060, 11.392, 10.967, 10.672, 10.456, 10.289, 10.158, 10.051},
    {13.745, 10.925, 9.780, 9.148, 8.746, 8.466, 8.260, 8.102, 7.976, 7.874},
    {12.246, 9.547, 8.451, 7.847, 7.460, 7.191, 6.993, 6.840, 6.719, 6.620},
    {11.259, 8.649, 7.591, 7.006, 6.632, 6.371, 6.178, 6.029, 5.911, 5.814},
    {10.561, 8.022, 6.992, 6.422, 6.057, 5.802, 5.613, 5.467, 5.351, 5.257},
    {10.044, 7.559, 6.552, 5.994, 5.636, 5.386, 5.200, 5.057, 4.942, 4.849},
    {9.646, 7.206, 6.217, 5.668, 5.316, 5.069, 4.886, 4.744, 4.632, 4.539},
    {9.330, 6.927, 5.953, 5.412, 5.064, 4.821, 4.640, 4.499, 4.388, 4.296},
    {9.074, 6.701, 5.739, 5.205, 4.862, 4.620, 4.441, 4.302, 4.191, 4.100},
    {8.862, 6.515, 5.564, 5.035, 4.695, 4.456, 4.278, 4.140, 4.030, 3.939},
    {8.683, 6.359, 5.417, 4.893, 4.556, 4.318, 4.142, 4.004, 3.895, 3.805},
    {8.531, 6.226, 5.292, 4.773, 4.437, 4.202, 4.026, 3.890, 3.780, 3.691},
    {8.400, 6.112, 5.185, 4.669, 4.336, 4.102, 3.927, 3.791, 3.682, 3.593},
    {8.285, 6.013, 5.092, 4.579, 4.248, 4.015, 3.841, 3.705, 3.597, 3.508},
    {8.185, 5.926, 5.010, 4.500, 4.171, 3.939, 3.765, 3.631, 3.523, 3.434},
    {8.096, 5.849, 4.938, 4.431, 4.103, 3.871, 3.699, 3.564, 3.457, 3.368},
    {8.017, 5.780, 4.874, 4.369, 4.042, 3.812, 3.640, 3.506, 3.398, 3.310},
    {7.945, 5.719, 4.817, 4.313, 3.988, 3.758, 3.587, 3.453, 3.346, 3.258},
    {7.881, 5.664, 4.765, 4.264, 3.939, 3.710, 3.539, 3.406, 3.299, 3.211},
    {7.823, 5.614, 4.718, 4.218, 3.895, 3.667, 3.496, 3.363, 3.256, 3.168},
    {7.770, 5.568, 4.675, 4.177, 3.855, 3.627, 3.457, 3.324, 3.217, 3.129},
    {7.721, 5.526, 4.637, 4.140, 3.818, 3.591, 3.421, 3.288, 3.182, 3.094},
    {7.677, 5.488, 4.601, 4.106, 3.785, 3.558, 3.388, 3.256, 3.149, 3.062},
    {7.636, 5.453, 4.568, 4.074, 3.754, 3.528, 3.358, 3.226, 3.120, 3.032},
    {7.598, 5.420, 4.538, 4.045, 3.725, 3.499, 3.330, 3.198, 3.092, 3.005},
    {7.562, 5.390, 4.510, 4.018, 3.699, 3.473, 3.305, 3.173, 3.067, 2.979},
    {7.530, 5.362, 4.484, 3.993, 3.675, 3.449, 3.281, 3.149, 3.043, 2.955},
    {7.499, 5.336, 4.459, 3.969, 3.652, 3.427, 3.258, 3.127, 3.021, 2.934},
    {7.471, 5.312, 4.437, 3.948, 3.630, 3.406, 3.238, 3.106, 3.000, 2.913},
    {7.444, 5.289, 4.416, 3.927, 3.611, 3.386, 3.218, 3.087, 2.981, 2.894},
    {7.419, 5.268, 4.396, 3.908, 3.592, 3.368, 3.200, 3.069, 2.963, 2.876},
    {7.396, 5.248, 4.377, 3.890, 3.574, 3.351, 3.183, 3.052, 2.946, 2.859},
    {7.373, 5.229, 4.360, 3.873, 3.558, 3.334, 3.167, 3.036, 2.930, 2.843},
    {7.353, 5.211, 4.343, 3.858, 3.542, 3.319, 3.152, 3.021, 2.915, 2.828},
    {7.333, 5.194, 4.327, 3.843, 3.528, 3.305, 3.137, 3.006, 2.901, 2.814},
    {7.314, 5.179, 4.313, 3.828, 3.514, 3.291, 3.124, 2.993, 2.888, 2.801},
    {7.296, 5.163, 4.299, 3.815, 3.501, 3.278, 3.111, 2.980, 2.875, 2.788},
    {7.280, 5.149, 4.285, 3.802, 3.488, 3.266, 3.099, 2.968, 2.863, 2.776},
    {7.264, 5.136, 4.273, 3.790, 3.476, 3.254, 3.087, 2.957, 2.851, 2.764},
    {7.248, 5.123, 4.261, 3.778, 3.465, 3.243, 3.076, 2.946, 2.840, 2.754},
    {7.234, 5.110, 4.249, 3.767, 3.454, 3.232, 3.066, 2.935, 2.830, 2.743},
    {7.220, 5.099, 4.238, 3.757, 3.444, 3.222, 3.056, 2.925, 2.820, 2.733},
    {7.207, 5.087, 4.228, 3.747, 3.434, 3.213, 3.046, 2.916, 2.811, 2.724},
    {7.194, 5.077, 4.218, 3.737, 3.425, 3.204, 3.037, 2.907, 2.802, 2.715},
    {7.182, 5.066, 4.208, 3.728, 3.416, 3.195, 3.028, 2.898, 2.793, 2.706},
    {7.171, 5.057, 4.199, 3.720, 3.408, 3.186, 3.020, 2.890, 2.785, 2.698},
    {7.159, 5.047, 4.191, 3.711, 3.400, 3.178, 3.012, 2.882, 2.777, 2.690},
    {7.149, 5.038, 4.182, 3.703, 3.392, 3.171, 3.005, 2.874, 2.769, 2.683},
    {7.139, 5.030, 4.174, 3.695, 3.384, 3.163, 2.997, 2.867, 2.762, 2.675},
    {7.129, 5.021, 4.167, 3.688, 3.377, 3.156, 2.990, 2.860, 2.755, 2.668},
    {7.119, 5.013, 4.159, 3.681, 3.370, 3.149, 2.983, 2.853, 2.748, 2.662},
    {7.110, 5.006, 4.152, 3.674, 3.363, 3.143, 2.977, 2.847, 2.742, 2.655},
    {7.102, 4.998, 4.145, 3.667, 3.357, 3.136, 2.971, 2.841, 2.736, 2.649},
    {7.093, 4.991, 4.138, 3.661, 3.351, 3.130, 2.965, 2.835, 2.730, 2.643},
    {7.085, 4.984, 4.132, 3.655, 3.345, 3.124, 2.959, 2.829, 2.724, 2.637},
    {7.077, 4.977, 4.126, 3.649, 3.339, 3.119, 2.953, 2.823, 2.718, 2.632},
    {7.070, 4.971, 4.120, 3.643, 3.333, 3.113, 2.948, 2.818, 2.713, 2.626},
    {7.062, 4.965, 4.114, 3.638, 3.328, 3.108, 2.942, 2.813, 2.708, 2.621},
    {7.055, 4.959, 4.109, 3.632, 3.323, 3.103, 2.937, 2.808, 2.703, 2.616},
    {7.048, 4.953, 4.103, 3.627, 3.318, 3.098, 2.932, 2.803, 2.698, 2.611},
    {7.042, 4.947, 4.098, 3.622, 3.313, 3.093, 2.928, 2.798, 2.693, 2.607},
    {7.035, 4.942, 4.093, 3.618, 3.308, 3.088, 2.923, 2.793, 2.689, 2.602},
    {7.029, 4.937, 4.088, 3.613, 3.304, 3.084, 2.919, 2.789, 2.684, 2.598},
    {7.023, 4.932, 4.083, 3.608, 3.299, 3.080, 2.914, 2.785, 2.680, 2.593},
    {7.017, 4.927, 4.079, 3.604, 3.295, 3.075, 2.910, 2.781, 2.676, 2.589},
    {7.011, 4.922, 4.074, 3.600, 3.291, 3.071, 2.906, 2.777, 2.672, 2.585},
    {7.006, 4.917, 4.070, 3.596, 3.287, 3.067, 2.902, 2.773, 2.668, 2.581},
    {7.001, 4.913, 4.066, 3.591, 3.283, 3.063, 2.898, 2.769, 2.664, 2.578},
    {6.995, 4.908, 4.062, 3.588, 3.279, 3.060, 2.895, 2.765, 2.660, 2.574},
    {6.990, 4.904, 4.058, 3.584, 3.275, 3.056, 2.891, 2.762, 2.657, 2.570},
    {6.985, 4.900, 4.054, 3.580, 3.272, 3.052, 2.887, 2.758, 2.653, 2.567},
    {6.981, 4.896, 4.050, 3.577, 3.268, 3.049, 2.884, 2.755, 2.650, 2.563},
    {6.976, 4.892, 4.047, 3.573, 3.265, 3.046, 2.881, 2.751, 2.647, 2.560},
    {6.971, 4.888, 4.043, 3.570, 3.261, 3.042, 2.877, 2.748, 2.644, 2.557},
    {6.967, 4.884, 4.040, 3.566, 3.258, 3.039, 2.874, 2.745, 2.640, 2.554},
    {6.963, 4.881, 4.036, 3.563, 3.255, 3.036, 2.871, 2.742, 2.637, 2.551},
    {6.958, 4.877, 4.033, 3.560, 3.252, 3.033, 2.868, 2.739, 2.634, 2.548},
    {6.954, 4.874, 4.030, 3.557, 3.249, 3.030, 2.865, 2.736, 2.632, 2.545},
    {6.950, 4.870, 4.027, 3.554, 3.246, 3.027, 2.863, 2.733, 2.629, 2.542},
    {6.947, 4.867, 4.024, 3.551, 3.243, 3.025, 2.860, 2.731, 2.626, 2.539},
    {6.943, 4.864, 4.021, 3.548, 3.240, 3.022, 2.857, 2.728, 2.623, 2.537},
    {6.939, 4.861, 4.018, 3.545, 3.238, 3.019, 2.854, 2.725, 2.621, 2.534},
    {6.935, 4.858, 4.015, 3.543, 3.235, 3.017, 2.852, 2.723, 2.618, 2.532},
    {6.932, 4.855, 4.012, 3.540, 3.233, 3.014, 2.849, 2.720, 2.616, 2.529},
    {6.928, 4.852, 4.010, 3.538, 3.230, 3.012, 2.847, 2.718, 2.613, 2.527},
    {6.925, 4.849, 4.007, 3.535, 3.228, 3.009, 2.845, 2.715, 2.611, 2.524},
    {6.922, 4.846, 4.004, 3.533, 3.225, 3.007, 2.842, 2.713, 2.609, 2.522},
    {6.919, 4.844, 4.002, 3.530, 3.223, 3.004, 2.840, 2.711, 2.606, 2.520},
    {6.915, 4.841, 3.999, 3.528, 3.221, 3.002, 2.838, 2.709, 2.604, 2.518},
    {6.912, 4.838, 3.997, 3.525, 3.218, 3.000, 2.835, 2.706, 2.602, 2.515},
    {6.909, 4.836, 3.995, 3.523, 3.216, 2.998, 2.833, 2.704, 2.600, 2.513},
    {6.906, 4.833, 3.992, 3.521, 3.214, 2.996, 2.831, 2.702, 2.598, 2.511},
    {6.904, 4.831, 3.990, 3.519, 3.212, 2.994, 2.829, 2.700, 2.596, 2.509},
    {6.901, 4.829, 3.988, 3.517, 3.210, 2.992, 2.827, 2.698, 2.594, 2.507},
    {6.898, 4.826, 3.986, 3.515, 3.208, 2.990, 2.825, 2.696, 2.592, 2.505},
    {6.895, 4.824, 3.984, 3.513, 3.206, 2.988, 2.823, 2.694, 2.590, 2.503}};
1232
1233 /** define the variance which will be used as a minimum variance for any
1234 dimension of any feature. Since most features are calculated from numbers
1235 with a precision no better than 1 in 128, the variance should never be
1236 less than the square of this number for parameters whose range is 1. */
1237 #define MINVARIANCE 0.0004
1238
/** MINSAMPLES is the absolute minimum number of samples which must be present
    in order to accurately test hypotheses about underlying probability
    distributions. MINSAMPLESNEEDED is defined separately as the minimum
    number of samples required before a statistical analysis is even
    attempted; it should normally equal MINSAMPLES but can be set to a lower
    number for early testing when very few samples are available. */
1245 #define MINSAMPLESPERBUCKET 5
1246 #define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET)
1247 #define MINSAMPLESNEEDED 1
1248
1249 /** define the size of the table which maps normalized samples to
1250 histogram buckets. Also define the number of standard deviations
1251 in a normal distribution which are considered to be significant.
1252 The mapping table will be defined in such a way that it covers
1253 the specified number of standard deviations on either side of
1254 the mean. BUCKETTABLESIZE should always be even. */
1255 #define BUCKETTABLESIZE 1024
1256 #define NORMALEXTENT 3.0
1257
// A candidate merge considered during bottom-up clustering: a cluster and
// its current nearest neighbor in the kd-tree.
struct TEMPCLUSTER {
  CLUSTER *Cluster;  // the cluster being considered for merging
  CLUSTER *Neighbor; // its nearest neighbor at the time of the search
};

// Heap entry pairing the cluster-to-neighbor distance (the sort key) with
// the TEMPCLUSTER it belongs to; the heap keeps the "best" pair on top
// (see ClusteringContext below).
using ClusterPair = tesseract::KDPairInc<float, TEMPCLUSTER *>;
using ClusterHeap = tesseract::GenericHeap<ClusterPair>;
1265
// Summary statistics for the samples of one cluster, used when deciding
// which kind of prototype fits the cluster.
struct STATISTICS {
  // n is the dimensionality of the feature space; the covariance matrix is
  // stored as a flat vector of n * n elements.
  STATISTICS(size_t n) : CoVariance(n * n), Min(n), Max(n) {
  }
  float AvgVariance = 1.0f;       // variance averaged over all dimensions
  std::vector<float> CoVariance;  // covariance matrix, n * n entries
  std::vector<float> Min; // largest negative distance from the mean
  std::vector<float> Max; // largest positive distance from the mean
};
1274
1275 struct BUCKETS {
BUCKETStesseract::BUCKETS1276 BUCKETS(size_t n) : NumberOfBuckets(n), Count(n), ExpectedCount(n) {
1277 }
~BUCKETStesseract::BUCKETS1278 ~BUCKETS() {
1279 }
1280 DISTRIBUTION Distribution = normal; // distribution being tested for
1281 uint32_t SampleCount = 0; // # of samples in histogram
1282 double Confidence = 0.0; // confidence level of test
1283 double ChiSquared = 0.0; // test threshold
1284 uint16_t NumberOfBuckets; // number of cells in histogram
1285 uint16_t Bucket[BUCKETTABLESIZE]; // mapping to histogram buckets
1286 std::vector<uint32_t> Count; // frequency of occurrence histogram
1287 std::vector<float> ExpectedCount; // expected histogram
1288 };
1289
// One cached entry of the chi-squared distribution: a chi-squared value
// together with its degrees of freedom and significance level.
struct CHISTRUCT {
  /// This constructor allocates a new data structure which is used
  /// to hold a chi-squared value along with its associated
  /// number of degrees of freedom and alpha value.
  ///
  /// @param degreesOfFreedom degrees of freedom for new chi value
  /// @param alpha confidence level for new chi value
  CHISTRUCT(uint16_t degreesOfFreedom, double alpha) : DegreesOfFreedom(degreesOfFreedom), Alpha(alpha) {
  }
  uint16_t DegreesOfFreedom = 0; // degrees of freedom for this entry
  double Alpha = 0.0;            // significance (confidence) level
  double ChiSquared = 0.0;       // chi-squared value; 0 until computed later
};
1303
// For use with KDWalk / MakePotentialClusters: bundles the mutable state
// shared across the tree walk that gathers candidate cluster merges.
struct ClusteringContext {
  ClusterHeap *heap;       // heap used to hold temp clusters, "best" on top
  TEMPCLUSTER *candidates; // array of potential clusters
  KDTREE *tree;            // kd-tree to be searched for neighbors
  int32_t next;            // next candidate to be used
};
1311
// Density function over the discrete histogram x-range
// (e.g. NormalDensity / UniformDensity below match this signature).
using DENSITYFUNC = double (*)(int32_t);
// Function handed to Solve() for root finding
// (e.g. ChiArea below matches this signature).
using SOLVEFUNC = double (*)(CHISTRUCT *, double);

#define Odd(N) ((N) % 2)                   // nonzero when N is odd
#define Mirror(N, R) ((R) - (N)-1)         // index N reflected about the middle of range R
#define Abs(N) (((N) < 0) ? (-(N)) : (N))  // absolute value
1318
//--------------Global Data Definitions and Declarations----------------------
/** the following variables describe a discrete normal distribution
    which is used by NormalDensity() and NormalBucket(). The
    constant NORMALEXTENT determines how many standard
    deviations of the distribution are mapped onto the fixed
    discrete range of x. x=0 is mapped to -NORMALEXTENT standard
    deviations and x=BUCKETTABLESIZE is mapped to
    +NORMALEXTENT standard deviations. */
#define SqrtOf2Pi 2.506628275
static const double kNormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT);
static const double kNormalVariance =
    (BUCKETTABLESIZE * BUCKETTABLESIZE) / (4.0 * NORMALEXTENT * NORMALEXTENT);
static const double kNormalMagnitude = (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE);
// Integer division, but exact because BUCKETTABLESIZE is even by design.
static const double kNormalMean = BUCKETTABLESIZE / 2;

/** define lookup tables used to compute the number of histogram buckets
    that should be used for a given number of samples. The two tables are
    parallel: kCountTable[i] samples maps to kBucketsTable[i] buckets. */
#define LOOKUPTABLESIZE 8
#define MAXDEGREESOFFREEDOM MAXBUCKETS

static const uint32_t kCountTable[LOOKUPTABLESIZE] = {MINSAMPLES, 200, 400, 600, 800,
                                                      1000, 1500, 2000}; // number of samples

static const uint16_t kBucketsTable[LOOKUPTABLESIZE] = {
    MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS}; // number of buckets
1344
/*-------------------------------------------------------------------------
          Private Function Prototypes
--------------------------------------------------------------------------*/
// --- Cluster-tree construction ---
static void CreateClusterTree(CLUSTERER *Clusterer);

static void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t Level);

static CLUSTER *FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, float *Distance);

static CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster);

// --- Prototype computation ---
static void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);

static PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster);

static PROTOTYPE *MakeDegenerateProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics,
                                      PROTOSTYLE Style, int32_t MinSamples);

static PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster,
                                      STATISTICS *Statistics);

static PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
                                     BUCKETS *Buckets);

static PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster,
                                      STATISTICS *Statistics, BUCKETS *Buckets);

static PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
                                 BUCKETS *NormalBuckets, double Confidence);

static void MakeDimRandom(uint16_t i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc);

static void MakeDimUniform(uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics);

static STATISTICS *ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster);

static PROTOTYPE *NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics);

static PROTOTYPE *NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics);

static PROTOTYPE *NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics);

static PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster);

static bool Independent(PARAM_DESC *ParamDesc, int16_t N, float *CoVariance, float Independence);

// --- Histogram buckets and distribution testing ---
static BUCKETS *GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uint32_t SampleCount,
                           double Confidence);

static BUCKETS *MakeBuckets(DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence);

static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount);

static double ComputeChiSquared(uint16_t DegreesOfFreedom, double Alpha);

static double NormalDensity(int32_t x);

static double UniformDensity(int32_t x);

static double Integral(double f1, double f2, double Dx);

static void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc,
                        float Mean, float StdDev);

static uint16_t NormalBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev);

static uint16_t UniformBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev);

static bool DistributionOK(BUCKETS *Buckets);

static uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets);

static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount);

static void InitBuckets(BUCKETS *Buckets);

// --- Numeric helpers ---
static int AlphaMatch(void *arg1, // CHISTRUCT *ChiStruct,
                      void *arg2); // CHISTRUCT *SearchKey);

static double Solve(SOLVEFUNC Function, void *FunctionParams, double InitialGuess, double Accuracy);

static double ChiArea(CHISTRUCT *ChiParams, double x);

static bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, float MaxIllegal);

static double InvertMatrix(const float *input, int size, float *inv);
1431
1432 //--------------------------Public Code--------------------------------------
1433 /**
1434 * This routine creates a new clusterer data structure,
1435 * initializes it, and returns a pointer to it.
1436 *
1437 * @param SampleSize number of dimensions in feature space
1438 * @param ParamDesc description of each dimension
1439 * @return pointer to the new clusterer data structure
1440 */
MakeClusterer(int16_t SampleSize,const PARAM_DESC ParamDesc[])1441 CLUSTERER *MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[]) {
1442 int i;
1443
1444 // allocate main clusterer data structure and init simple fields
1445 auto Clusterer = new CLUSTERER;
1446 Clusterer->SampleSize = SampleSize;
1447 Clusterer->NumberOfSamples = 0;
1448 Clusterer->NumChar = 0;
1449
1450 // init fields which will not be used initially
1451 Clusterer->Root = nullptr;
1452 Clusterer->ProtoList = NIL_LIST;
1453
1454 // maintain a copy of param descriptors in the clusterer data structure
1455 Clusterer->ParamDesc = new PARAM_DESC[SampleSize];
1456 for (i = 0; i < SampleSize; i++) {
1457 Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
1458 Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
1459 Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
1460 Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
1461 Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
1462 Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
1463 Clusterer->ParamDesc[i].MidRange = (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
1464 }
1465
1466 // allocate a kd tree to hold the samples
1467 Clusterer->KDTree = MakeKDTree(SampleSize, ParamDesc);
1468
1469 // Initialize cache of histogram buckets to minimize recomputing them.
1470 for (auto &d : Clusterer->bucket_cache) {
1471 for (auto &c : d) {
1472 c = nullptr;
1473 }
1474 }
1475
1476 return Clusterer;
1477 } // MakeClusterer
1478
1479 /**
1480 * This routine creates a new sample data structure to hold
1481 * the specified feature. This sample is added to the clusterer
1482 * data structure (so that it knows which samples are to be
1483 * clustered later), and a pointer to the sample is returned to
1484 * the caller.
1485 *
1486 * @param Clusterer clusterer data structure to add sample to
1487 * @param Feature feature to be added to clusterer
1488 * @param CharID unique ident. of char that sample came from
1489 *
1490 * @return Pointer to the new sample data structure
1491 */
MakeSample(CLUSTERER * Clusterer,const float * Feature,uint32_t CharID)1492 SAMPLE *MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID) {
1493 int i;
1494
1495 // see if the samples have already been clustered - if so trap an error
1496 // Can't add samples after they have been clustered.
1497 ASSERT_HOST(Clusterer->Root == nullptr);
1498
1499 // allocate the new sample and initialize it
1500 auto Sample = new SAMPLE(Clusterer->SampleSize);
1501 Sample->Clustered = false;
1502 Sample->Prototype = false;
1503 Sample->SampleCount = 1;
1504 Sample->Left = nullptr;
1505 Sample->Right = nullptr;
1506 Sample->CharID = CharID;
1507
1508 for (i = 0; i < Clusterer->SampleSize; i++) {
1509 Sample->Mean[i] = Feature[i];
1510 }
1511
1512 // add the sample to the KD tree - keep track of the total # of samples
1513 Clusterer->NumberOfSamples++;
1514 KDStore(Clusterer->KDTree, &Sample->Mean[0], Sample);
1515 if (CharID >= Clusterer->NumChar) {
1516 Clusterer->NumChar = CharID + 1;
1517 }
1518
1519 // execute hook for monitoring clustering operation
1520 // (*SampleCreationHook)(Sample);
1521
1522 return (Sample);
1523 } // MakeSample
1524
1525 /**
1526 * This routine first checks to see if the samples in this
1527 * clusterer have already been clustered before; if so, it does
1528 * not bother to recreate the cluster tree. It simply recomputes
1529 * the prototypes based on the new Config info.
1530 *
1531 * If the samples have not been clustered before, the
1532 * samples in the KD tree are formed into a cluster tree and then
1533 * the prototypes are computed from the cluster tree.
1534 *
1535 * In either case this routine returns a pointer to a
1536 * list of prototypes that best represent the samples given
1537 * the constraints specified in Config.
1538 *
1539 * @param Clusterer data struct containing samples to be clustered
1540 * @param Config parameters which control clustering process
1541 *
1542 * @return Pointer to a list of prototypes
1543 */
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
  // only create cluster tree if samples have never been clustered before
  if (Clusterer->Root == nullptr) {
    CreateClusterTree(Clusterer);
  }

  // deallocate the old prototype list if one exists
  FreeProtoList(&Clusterer->ProtoList);
  Clusterer->ProtoList = NIL_LIST;

  // compute prototypes starting at the root node in the tree
  // (Config is consulted only by ComputePrototypes)
  ComputePrototypes(Clusterer, Config);
  // We don't need the cluster pointers in the protos any more, so null them
  // out, which makes it safe to delete the clusterer.
  LIST proto_list = Clusterer->ProtoList;
  iterate(proto_list) {
    auto *proto = reinterpret_cast<PROTOTYPE *>(proto_list->first_node());
    proto->Cluster = nullptr;
  }
  return Clusterer->ProtoList;
} // ClusterSamples
1565
1566 /**
1567 * This routine frees all of the memory allocated to the
1568 * specified data structure. It will not, however, free
1569 * the memory used by the prototype list. The pointers to
1570 * the clusters for each prototype in the list will be set
1571 * to nullptr to indicate that the cluster data structures no
1572 * longer exist. Any sample lists that have been obtained
1573 * via calls to GetSamples are no longer valid.
1574 * @param Clusterer pointer to data structure to be freed
1575 */
FreeClusterer(CLUSTERER * Clusterer)1576 void FreeClusterer(CLUSTERER *Clusterer) {
1577 if (Clusterer != nullptr) {
1578 delete[] Clusterer->ParamDesc;
1579 delete Clusterer->KDTree;
1580 delete Clusterer->Root;
1581 // Free up all used buckets structures.
1582 for (auto &d : Clusterer->bucket_cache) {
1583 for (auto &c : d) {
1584 delete c;
1585 }
1586 }
1587
1588 delete Clusterer;
1589 }
1590 } // FreeClusterer
1591
1592 /**
1593 * This routine frees all of the memory allocated to the
1594 * specified list of prototypes. The clusters which are
1595 * pointed to by the prototypes are not freed.
1596 * @param ProtoList pointer to list of prototypes to be freed
1597 */
void FreeProtoList(LIST *ProtoList) {
  // Destroy every node in the list, freeing each prototype via FreePrototype.
  destroy_nodes(*ProtoList, FreePrototype);
} // FreeProtoList
1601
1602 /**
1603 * This routine deallocates the memory consumed by the specified
1604 * prototype and modifies the corresponding cluster so that it
1605 * is no longer marked as a prototype. The cluster is NOT
1606 * deallocated by this routine.
1607 * @param arg prototype data structure to be deallocated
1608 */
FreePrototype(void * arg)1609 void FreePrototype(void *arg) { // PROTOTYPE *Prototype)
1610 auto *Prototype = static_cast<PROTOTYPE *>(arg);
1611
1612 // unmark the corresponding cluster (if there is one
1613 if (Prototype->Cluster != nullptr) {
1614 Prototype->Cluster->Prototype = false;
1615 }
1616
1617 // deallocate the prototype statistics and then the prototype itself
1618 if (Prototype->Style != spherical) {
1619 delete[] Prototype->Variance.Elliptical;
1620 delete[] Prototype->Magnitude.Elliptical;
1621 delete[] Prototype->Weight.Elliptical;
1622 }
1623 delete Prototype;
1624 } // FreePrototype
1625
1626 /**
1627 * This routine is used to find all of the samples which
1628 * belong to a cluster. It starts by removing the top
1629 * cluster on the cluster list (SearchState). If this cluster is
1630 * a leaf it is returned. Otherwise, the right subcluster
1631 * is pushed on the list and we continue the search in the
1632 * left subcluster. This continues until a leaf is found.
1633 * If all samples have been found, nullptr is returned.
1634 * InitSampleSearch() must be called
1635 * before NextSample() to initialize the search.
1636 * @param SearchState ptr to list containing clusters to be searched
1637 * @return Pointer to the next leaf cluster (sample) or nullptr.
1638 */
NextSample(LIST * SearchState)1639 CLUSTER *NextSample(LIST *SearchState) {
1640 CLUSTER *Cluster;
1641
1642 if (*SearchState == NIL_LIST) {
1643 return (nullptr);
1644 }
1645 Cluster = reinterpret_cast<CLUSTER *>((*SearchState)->first_node());
1646 *SearchState = pop(*SearchState);
1647 for (;;) {
1648 if (Cluster->Left == nullptr) {
1649 return (Cluster);
1650 }
1651 *SearchState = push(*SearchState, Cluster->Right);
1652 Cluster = Cluster->Left;
1653 }
1654 } // NextSample
1655
1656 /**
1657 * This routine returns the mean of the specified
1658 * prototype in the indicated dimension.
1659 * @param Proto prototype to return mean of
1660 * @param Dimension dimension whose mean is to be returned
1661 * @return Mean of Prototype in Dimension
1662 */
Mean(PROTOTYPE * Proto,uint16_t Dimension)1663 float Mean(PROTOTYPE *Proto, uint16_t Dimension) {
1664 return (Proto->Mean[Dimension]);
1665 } // Mean
1666
1667 /**
1668 * This routine returns the standard deviation of the
1669 * prototype in the indicated dimension.
1670 * @param Proto prototype to return standard deviation of
1671 * @param Dimension dimension whose stddev is to be returned
1672 * @return Standard deviation of Prototype in Dimension
1673 */
StandardDeviation(PROTOTYPE * Proto,uint16_t Dimension)1674 float StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension) {
1675 switch (Proto->Style) {
1676 case spherical:
1677 return std::sqrt(Proto->Variance.Spherical);
1678 case elliptical:
1679 return std::sqrt(Proto->Variance.Elliptical[Dimension]);
1680 case mixed:
1681 switch (Proto->Distrib[Dimension]) {
1682 case normal:
1683 return std::sqrt(Proto->Variance.Elliptical[Dimension]);
1684 case uniform:
1685 case D_random:
1686 return Proto->Variance.Elliptical[Dimension];
1687 case DISTRIBUTION_COUNT:
1688 ASSERT_HOST(!"Distribution count not allowed!");
1689 }
1690 }
1691 return 0.0f;
1692 } // StandardDeviation
1693
1694 /*---------------------------------------------------------------------------
1695 Private Code
1696 ----------------------------------------------------------------------------*/
1697 /**
1698 * This routine performs a bottoms-up clustering on the samples
1699 * held in the kd-tree of the Clusterer data structure. The
1700 * result is a cluster tree. Each node in the tree represents
1701 * a cluster which conceptually contains a subset of the samples.
1702 * More precisely, the cluster contains all of the samples which
1703 * are contained in its two sub-clusters. The leaves of the
1704 * tree are the individual samples themselves; they have no
1705 * sub-clusters. The root node of the tree conceptually contains
1706 * all of the samples.
1707 * The Clusterer data structure is changed.
1708 * @param Clusterer data structure holdings samples to be clustered
1709 */
static void CreateClusterTree(CLUSTERER *Clusterer) {
  ClusteringContext context;
  ClusterPair HeapEntry;

  // each sample and its nearest neighbor form a "potential" cluster
  // save these in a heap with the "best" potential clusters on top
  context.tree = Clusterer->KDTree;
  // candidates is indexed by context.next; one TEMPCLUSTER slot per sample.
  context.candidates = new TEMPCLUSTER[Clusterer->NumberOfSamples];
  context.next = 0;
  context.heap = new ClusterHeap(Clusterer->NumberOfSamples);
  KDWalk(context.tree, reinterpret_cast<void_proc>(MakePotentialClusters), &context);

  // form potential clusters into actual clusters - always do "best" first
  while (context.heap->Pop(&HeapEntry)) {
    TEMPCLUSTER *PotentialCluster = HeapEntry.data();

    // if main cluster of potential cluster is already in another cluster
    // then we don't need to worry about it
    if (PotentialCluster->Cluster->Clustered) {
      continue;
    }

    // if main cluster is not yet clustered, but its nearest neighbor is
    // then we must find a new nearest neighbor
    else if (PotentialCluster->Neighbor->Clustered) {
      PotentialCluster->Neighbor =
          FindNearestNeighbor(context.tree, PotentialCluster->Cluster, &HeapEntry.key());
      // re-queue the entry with the updated neighbor and distance key;
      // if no neighbor remains, this candidate is simply dropped
      if (PotentialCluster->Neighbor != nullptr) {
        context.heap->Push(&HeapEntry);
      }
    }

    // if neither cluster is already clustered, form permanent cluster
    else {
      PotentialCluster->Cluster = MakeNewCluster(Clusterer, PotentialCluster);
      // the merged cluster immediately becomes a new merge candidate:
      // find its nearest neighbor and push it back on the heap
      PotentialCluster->Neighbor =
          FindNearestNeighbor(context.tree, PotentialCluster->Cluster, &HeapEntry.key());
      if (PotentialCluster->Neighbor != nullptr) {
        context.heap->Push(&HeapEntry);
      }
    }
  }

  // the root node in the cluster tree is now the only node in the kd-tree
  // (RootOf appears to be a function-like macro, hence the unusual
  // cast-without-parentheses spelling — confirm in the kd-tree header)
  Clusterer->Root = static_cast<CLUSTER *> RootOf(Clusterer->KDTree);

  // free up the memory used by the K-D tree, heap, and temp clusters
  delete context.tree;
  Clusterer->KDTree = nullptr;
  delete context.heap;
  delete[] context.candidates;
} // CreateClusterTree
1762
1763 /**
1764 * This routine is designed to be used in concert with the
1765 * KDWalk routine. It will create a potential cluster for
1766 * each sample in the kd-tree that is being walked. This
1767 * potential cluster will then be pushed on the heap.
1768 * @param context ClusteringContext (see definition above)
1769 * @param Cluster current cluster being visited in kd-tree walk
1770 * @param Level level of this cluster in the kd-tree
1771 */
MakePotentialClusters(ClusteringContext * context,CLUSTER * Cluster,int32_t)1772 static void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t /*Level*/) {
1773 ClusterPair HeapEntry;
1774 int next = context->next;
1775 context->candidates[next].Cluster = Cluster;
1776 HeapEntry.data() = &(context->candidates[next]);
1777 context->candidates[next].Neighbor =
1778 FindNearestNeighbor(context->tree, context->candidates[next].Cluster, &HeapEntry.key());
1779 if (context->candidates[next].Neighbor != nullptr) {
1780 context->heap->Push(&HeapEntry);
1781 context->next++;
1782 }
1783 } // MakePotentialClusters
1784
1785 /**
1786 * This routine searches the specified kd-tree for the nearest
1787 * neighbor of the specified cluster. It actually uses the
1788 * kd routines to find the 2 nearest neighbors since one of them
1789 * will be the original cluster. A pointer to the nearest
1790 * neighbor is returned, if it can be found, otherwise nullptr is
1791 * returned. The distance between the 2 nodes is placed
1792 * in the specified variable.
1793 * @param Tree kd-tree to search in for nearest neighbor
1794 * @param Cluster cluster whose nearest neighbor is to be found
1795 * @param Distance ptr to variable to report distance found
1796 * @return Pointer to the nearest neighbor of Cluster, or nullptr
1797 */
FindNearestNeighbor(KDTREE * Tree,CLUSTER * Cluster,float * Distance)1798 static CLUSTER *FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, float *Distance)
1799 #define MAXNEIGHBORS 2
1800 #define MAXDISTANCE FLT_MAX
1801 {
1802 CLUSTER *Neighbor[MAXNEIGHBORS];
1803 float Dist[MAXNEIGHBORS];
1804 int NumberOfNeighbors;
1805 int32_t i;
1806 CLUSTER *BestNeighbor;
1807
1808 // find the 2 nearest neighbors of the cluster
1809 KDNearestNeighborSearch(Tree, &Cluster->Mean[0], MAXNEIGHBORS, MAXDISTANCE, &NumberOfNeighbors,
1810 reinterpret_cast<void **>(Neighbor), Dist);
1811
1812 // search for the nearest neighbor that is not the cluster itself
1813 *Distance = MAXDISTANCE;
1814 BestNeighbor = nullptr;
1815 for (i = 0; i < NumberOfNeighbors; i++) {
1816 if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
1817 *Distance = Dist[i];
1818 BestNeighbor = Neighbor[i];
1819 }
1820 }
1821 return BestNeighbor;
1822 } // FindNearestNeighbor
1823
1824 /**
1825 * This routine creates a new permanent cluster from the
1826 * clusters specified in TempCluster. The 2 clusters in
1827 * TempCluster are marked as "clustered" and deleted from
1828 * the kd-tree. The new cluster is then added to the kd-tree.
1829 * @param Clusterer current clustering environment
1830 * @param TempCluster potential cluster to make permanent
1831 * @return Pointer to the new permanent cluster
1832 */
MakeNewCluster(CLUSTERER * Clusterer,TEMPCLUSTER * TempCluster)1833 static CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) {
1834 // allocate the new cluster and initialize it
1835 auto Cluster = new CLUSTER(Clusterer->SampleSize);
1836 Cluster->Clustered = false;
1837 Cluster->Prototype = false;
1838 Cluster->Left = TempCluster->Cluster;
1839 Cluster->Right = TempCluster->Neighbor;
1840 Cluster->CharID = -1;
1841
1842 // mark the old clusters as "clustered" and delete them from the kd-tree
1843 Cluster->Left->Clustered = true;
1844 Cluster->Right->Clustered = true;
1845 KDDelete(Clusterer->KDTree, &Cluster->Left->Mean[0], Cluster->Left);
1846 KDDelete(Clusterer->KDTree, &Cluster->Right->Mean[0], Cluster->Right);
1847
1848 // compute the mean and sample count for the new cluster
1849 Cluster->SampleCount = MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,
1850 Cluster->Left->SampleCount, Cluster->Right->SampleCount,
1851 &Cluster->Mean[0], &Cluster->Left->Mean[0], &Cluster->Right->Mean[0]);
1852
1853 // add the new cluster to the KD tree
1854 KDStore(Clusterer->KDTree, &Cluster->Mean[0], Cluster);
1855 return Cluster;
1856 } // MakeNewCluster
1857
1858 /**
1859 * This routine merges two clusters into one larger cluster.
1860 * To do this it computes the number of samples in the new
1861 * cluster and the mean of the new cluster. The ParamDesc
1862 * information is used to ensure that circular dimensions
1863 * are handled correctly.
1864 * @param N # of dimensions (size of arrays)
1865 * @param ParamDesc array of dimension descriptions
1866 * @param n1, n2 number of samples in each old cluster
1867 * @param m array to hold mean of new cluster
1868 * @param m1, m2 arrays containing means of old clusters
1869 * @return The number of samples in the new cluster.
1870 */
MergeClusters(int16_t N,PARAM_DESC ParamDesc[],int32_t n1,int32_t n2,float m[],float m1[],float m2[])1871 int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[],
1872 float m1[], float m2[]) {
1873 int32_t i, n;
1874
1875 n = n1 + n2;
1876 for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
1877 if (ParamDesc->Circular) {
1878 // if distance between means is greater than allowed
1879 // reduce upper point by one "rotation" to compute mean
1880 // then normalize the mean back into the accepted range
1881 if ((*m2 - *m1) > ParamDesc->HalfRange) {
1882 *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
1883 if (*m < ParamDesc->Min) {
1884 *m += ParamDesc->Range;
1885 }
1886 } else if ((*m1 - *m2) > ParamDesc->HalfRange) {
1887 *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
1888 if (*m < ParamDesc->Min) {
1889 *m += ParamDesc->Range;
1890 }
1891 } else {
1892 *m = (n1 * *m1 + n2 * *m2) / n;
1893 }
1894 } else {
1895 *m = (n1 * *m1 + n2 * *m2) / n;
1896 }
1897 }
1898 return n;
1899 } // MergeClusters
1900
1901 /**
1902 * This routine decides which clusters in the cluster tree
1903 * should be represented by prototypes, forms a list of these
1904 * prototypes, and places the list in the Clusterer data
1905 * structure.
1906 * @param Clusterer data structure holding cluster tree
1907 * @param Config parameters used to control prototype generation
1908 */
ComputePrototypes(CLUSTERER * Clusterer,CLUSTERCONFIG * Config)1909 static void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
1910 LIST ClusterStack = NIL_LIST;
1911 CLUSTER *Cluster;
1912 PROTOTYPE *Prototype;
1913
1914 // use a stack to keep track of clusters waiting to be processed
1915 // initially the only cluster on the stack is the root cluster
1916 if (Clusterer->Root != nullptr) {
1917 ClusterStack = push(NIL_LIST, Clusterer->Root);
1918 }
1919
1920 // loop until we have analyzed all clusters which are potential prototypes
1921 while (ClusterStack != NIL_LIST) {
1922 // remove the next cluster to be analyzed from the stack
1923 // try to make a prototype from the cluster
1924 // if successful, put it on the proto list, else split the cluster
1925 Cluster = reinterpret_cast<CLUSTER *>(ClusterStack->first_node());
1926 ClusterStack = pop(ClusterStack);
1927 Prototype = MakePrototype(Clusterer, Config, Cluster);
1928 if (Prototype != nullptr) {
1929 Clusterer->ProtoList = push(Clusterer->ProtoList, Prototype);
1930 } else {
1931 ClusterStack = push(ClusterStack, Cluster->Right);
1932 ClusterStack = push(ClusterStack, Cluster->Left);
1933 }
1934 }
1935 } // ComputePrototypes
1936
1937 /**
1938 * This routine attempts to create a prototype from the
1939 * specified cluster that conforms to the distribution
1940 * specified in Config. If there are too few samples in the
1941 * cluster to perform a statistical analysis, then a prototype
1942 * is generated but labelled as insignificant. If the
1943 * dimensions of the cluster are not independent, no prototype
1944 * is generated and nullptr is returned. If a prototype can be
1945 * found that matches the desired distribution then a pointer
1946 * to it is returned, otherwise nullptr is returned.
1947 * @param Clusterer data structure holding cluster tree
1948 * @param Config parameters used to control prototype generation
1949 * @param Cluster cluster to be made into a prototype
1950 * @return Pointer to new prototype or nullptr
1951 */
static PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster) {
  PROTOTYPE *Proto;
  BUCKETS *Buckets;

  // filter out clusters which contain samples from the same character
  if (MultipleCharSamples(Clusterer, Cluster, Config->MaxIllegal)) {
    return nullptr;
  }

  // compute the covariance matrix and ranges for the cluster
  // NOTE: Statistics is owned by this function and must be deleted on
  // every exit path below.
  auto Statistics = ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);

  // check for degenerate clusters which need not be analyzed further
  // note that the MinSamples test assumes that all clusters with multiple
  // character samples have been removed (as above)
  Proto = MakeDegenerateProto(Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle,
                              static_cast<int32_t>(Config->MinSamples * Clusterer->NumChar));
  if (Proto != nullptr) {
    delete Statistics;
    return Proto;
  }
  // check to ensure that all dimensions are independent
  if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize, &Statistics->CoVariance[0],
                   Config->Independence)) {
    delete Statistics;
    return nullptr;
  }

  // Hotelling's T-squared shortcut: if the two subclusters are not
  // significantly different, accept an elliptical proto immediately.
  if (HOTELLING && Config->ProtoStyle == elliptical) {
    Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
    if (Proto != nullptr) {
      delete Statistics;
      return Proto;
    }
  }

  // create a histogram data structure used to evaluate distributions
  Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount, Config->Confidence);

  // create a prototype based on the statistics and test it
  switch (Config->ProtoStyle) {
    case spherical:
      Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
      break;
    case elliptical:
      Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
      break;
    case mixed:
      Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets, Config->Confidence);
      break;
    case automatic:
      // automatic style tries the simplest model first and falls through
      // to progressively more flexible models until one fits.
      Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
      if (Proto != nullptr) {
        break;
      }
      Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
      if (Proto != nullptr) {
        break;
      }
      Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets, Config->Confidence);
      break;
  }
  delete Statistics;
  return Proto;
} // MakePrototype
2017
2018 /**
2019 * This routine checks for clusters which are degenerate and
2020 * therefore cannot be analyzed in a statistically valid way.
2021 * A cluster is defined as degenerate if it does not have at
2022 * least MINSAMPLESNEEDED samples in it. If the cluster is
2023 * found to be degenerate, a prototype of the specified style
2024 * is generated and marked as insignificant. A cluster is
2025 * also degenerate if it does not have at least MinSamples
2026 * samples in it.
2027 *
2028 * If the cluster is not degenerate, nullptr is returned.
2029 *
2030 * @param N number of dimensions
2031 * @param Cluster cluster being analyzed
2032 * @param Statistics statistical info about cluster
2033 * @param Style type of prototype to be generated
2034 * @param MinSamples minimum number of samples in a cluster
2035 * @return Pointer to degenerate prototype or nullptr.
2036 */
MakeDegenerateProto(uint16_t N,CLUSTER * Cluster,STATISTICS * Statistics,PROTOSTYLE Style,int32_t MinSamples)2037 static PROTOTYPE *MakeDegenerateProto( // this was MinSample
2038 uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, int32_t MinSamples) {
2039 PROTOTYPE *Proto = nullptr;
2040
2041 if (MinSamples < MINSAMPLESNEEDED) {
2042 MinSamples = MINSAMPLESNEEDED;
2043 }
2044
2045 if (Cluster->SampleCount < MinSamples) {
2046 switch (Style) {
2047 case spherical:
2048 Proto = NewSphericalProto(N, Cluster, Statistics);
2049 break;
2050 case elliptical:
2051 case automatic:
2052 Proto = NewEllipticalProto(N, Cluster, Statistics);
2053 break;
2054 case mixed:
2055 Proto = NewMixedProto(N, Cluster, Statistics);
2056 break;
2057 }
2058 Proto->Significant = false;
2059 }
2060 return (Proto);
2061 } // MakeDegenerateProto
2062
2063 /**
 * This routine tests the specified cluster to see if
2065 * there is a statistically significant difference between
2066 * the sub-clusters that would be made if the cluster were to
2067 * be split. If not, then a new prototype is formed and
2068 * returned to the caller. If there is, then nullptr is returned
2069 * to the caller.
2070 * @param Clusterer data struct containing samples being clustered
2071 * @param Config provides the magic number of samples that make a good cluster
2072 * @param Cluster cluster to be made into an elliptical prototype
2073 * @param Statistics statistical info about cluster
2074 * @return Pointer to new elliptical prototype or nullptr.
2075 */
static PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster,
                                      STATISTICS *Statistics) {
  // Fraction of the number of samples used as a range around 1 within
  // which a cluster has the magic size that allows a boost to the
  // FTable by kFTableBoostMargin, thus allowing clusters near the
  // magic size (equal to the number of sample characters) to be more
  // likely to stay together.
  const double kMagicSampleMargin = 0.0625;
  const double kFTableBoostMargin = 2.0;

  int N = Clusterer->SampleSize;
  CLUSTER *Left = Cluster->Left;
  CLUSTER *Right = Cluster->Right;
  // A leaf has no subclusters to compare.
  if (Left == nullptr || Right == nullptr) {
    return nullptr;
  }
  int TotalDims = Left->SampleCount + Right->SampleCount;
  // Need more samples than dimensions for the test to be meaningful.
  if (TotalDims < N + 1 || TotalDims < 2) {
    return nullptr;
  }
  std::vector<float> Covariance(static_cast<size_t>(N) * N);
  std::vector<float> Inverse(static_cast<size_t>(N) * N);
  std::vector<float> Delta(N);
  // Compute a new covariance matrix that only uses essential features.
  // Non-essential rows/columns are replaced by identity entries so the
  // matrix stays invertible without affecting the essential dimensions.
  for (int i = 0; i < N; ++i) {
    int row_offset = i * N;
    if (!Clusterer->ParamDesc[i].NonEssential) {
      for (int j = 0; j < N; ++j) {
        if (!Clusterer->ParamDesc[j].NonEssential) {
          Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
        } else {
          Covariance[j + row_offset] = 0.0f;
        }
      }
    } else {
      for (int j = 0; j < N; ++j) {
        if (i == j) {
          Covariance[j + row_offset] = 1.0f;
        } else {
          Covariance[j + row_offset] = 0.0f;
        }
      }
    }
  }
  double err = InvertMatrix(&Covariance[0], N, &Inverse[0]);
  if (err > 1) {
    tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
  }
  // Delta is the difference of subcluster means, restricted to the
  // essential dimensions; EssentialN counts those dimensions.
  int EssentialN = 0;
  for (int dim = 0; dim < N; ++dim) {
    if (!Clusterer->ParamDesc[dim].NonEssential) {
      Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
      ++EssentialN;
    } else {
      Delta[dim] = 0.0f;
    }
  }
  // Compute Hotelling's T-squared.
  // Tsq = Delta^T * Inverse * Delta
  double Tsq = 0.0;
  for (int x = 0; x < N; ++x) {
    double temp = 0.0;
    for (int y = 0; y < N; ++y) {
      temp += static_cast<double>(Inverse[y + N * x]) * Delta[y];
    }
    Tsq += Delta[x] * temp;
  }
  // Changed this function to match the formula in
  // Statistical Methods in Medical Research p 473
  // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
  // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
  double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2) * EssentialN);
  // Clamp the degrees of freedom into the bounds of the FTable, then
  // convert to 0-based indices.
  int Fx = EssentialN;
  if (Fx > FTABLE_X) {
    Fx = FTABLE_X;
  }
  --Fx;
  int Fy = TotalDims - EssentialN - 1;
  if (Fy > FTABLE_Y) {
    Fy = FTABLE_Y;
  }
  --Fy;
  double FTarget = FTable[Fy][Fx];
  if (Config->MagicSamples > 0 && TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
      TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
    // Give magic-sized clusters a magic FTable boost.
    FTarget += kFTableBoostMargin;
  }
  // If the subcluster means are NOT significantly different, keep them
  // together as one elliptical prototype; otherwise signal the caller
  // (by returning nullptr) that the cluster should be split.
  if (F < FTarget) {
    return NewEllipticalProto(Clusterer->SampleSize, Cluster, Statistics);
  }
  return nullptr;
}
2168
2169 /**
2170 * This routine tests the specified cluster to see if it can
2171 * be approximated by a spherical normal distribution. If it
2172 * can be, then a new prototype is formed and returned to the
2173 * caller. If it can't be, then nullptr is returned to the caller.
2174 * @param Clusterer data struct containing samples being clustered
2175 * @param Cluster cluster to be made into a spherical prototype
2176 * @param Statistics statistical info about cluster
2177 * @param Buckets histogram struct used to analyze distribution
2178 * @return Pointer to new spherical prototype or nullptr.
2179 */
MakeSphericalProto(CLUSTERER * Clusterer,CLUSTER * Cluster,STATISTICS * Statistics,BUCKETS * Buckets)2180 static PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
2181 BUCKETS *Buckets) {
2182 PROTOTYPE *Proto = nullptr;
2183 int i;
2184
2185 // check that each dimension is a normal distribution
2186 for (i = 0; i < Clusterer->SampleSize; i++) {
2187 if (Clusterer->ParamDesc[i].NonEssential) {
2188 continue;
2189 }
2190
2191 FillBuckets(Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), Cluster->Mean[i],
2192 sqrt(static_cast<double>(Statistics->AvgVariance)));
2193 if (!DistributionOK(Buckets)) {
2194 break;
2195 }
2196 }
2197 // if all dimensions matched a normal distribution, make a proto
2198 if (i >= Clusterer->SampleSize) {
2199 Proto = NewSphericalProto(Clusterer->SampleSize, Cluster, Statistics);
2200 }
2201 return (Proto);
2202 } // MakeSphericalProto
2203
2204 /**
2205 * This routine tests the specified cluster to see if it can
2206 * be approximated by an elliptical normal distribution. If it
2207 * can be, then a new prototype is formed and returned to the
2208 * caller. If it can't be, then nullptr is returned to the caller.
2209 * @param Clusterer data struct containing samples being clustered
2210 * @param Cluster cluster to be made into an elliptical prototype
2211 * @param Statistics statistical info about cluster
2212 * @param Buckets histogram struct used to analyze distribution
2213 * @return Pointer to new elliptical prototype or nullptr.
2214 */
MakeEllipticalProto(CLUSTERER * Clusterer,CLUSTER * Cluster,STATISTICS * Statistics,BUCKETS * Buckets)2215 static PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster,
2216 STATISTICS *Statistics, BUCKETS *Buckets) {
2217 PROTOTYPE *Proto = nullptr;
2218 int i;
2219
2220 // check that each dimension is a normal distribution
2221 for (i = 0; i < Clusterer->SampleSize; i++) {
2222 if (Clusterer->ParamDesc[i].NonEssential) {
2223 continue;
2224 }
2225
2226 FillBuckets(Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), Cluster->Mean[i],
2227 sqrt(static_cast<double>(Statistics->CoVariance[i * (Clusterer->SampleSize + 1)])));
2228 if (!DistributionOK(Buckets)) {
2229 break;
2230 }
2231 }
2232 // if all dimensions matched a normal distribution, make a proto
2233 if (i >= Clusterer->SampleSize) {
2234 Proto = NewEllipticalProto(Clusterer->SampleSize, Cluster, Statistics);
2235 }
2236 return (Proto);
2237 } // MakeEllipticalProto
2238
2239 /**
2240 * This routine tests each dimension of the specified cluster to
2241 * see what distribution would best approximate that dimension.
2242 * Each dimension is compared to the following distributions
2243 * in order: normal, random, uniform. If each dimension can
2244 * be represented by one of these distributions,
2245 * then a new prototype is formed and returned to the
2246 * caller. If it can't be, then nullptr is returned to the caller.
2247 * @param Clusterer data struct containing samples being clustered
2248 * @param Cluster cluster to be made into a prototype
2249 * @param Statistics statistical info about cluster
2250 * @param NormalBuckets histogram struct used to analyze distribution
2251 * @param Confidence confidence level for alternate distributions
2252 * @return Pointer to new mixed prototype or nullptr.
2253 */
static PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
                                 BUCKETS *NormalBuckets, double Confidence) {
  PROTOTYPE *Proto;
  int i;
  // Bucket sets for the alternate distributions are created lazily,
  // only if some dimension fails the normal test.
  BUCKETS *UniformBuckets = nullptr;
  BUCKETS *RandomBuckets = nullptr;

  // create a mixed proto to work on - initially assume all dimensions normal
  Proto = NewMixedProto(Clusterer->SampleSize, Cluster, Statistics);

  // find the proper distribution for each dimension
  // distributions are tried in order: normal, random, uniform;
  // the first one that fits wins, and failing all three aborts the proto
  for (i = 0; i < Clusterer->SampleSize; i++) {
    if (Clusterer->ParamDesc[i].NonEssential) {
      continue;
    }

    // first choice: normal, with this dimension's own variance
    FillBuckets(NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],
                std::sqrt(Proto->Variance.Elliptical[i]));
    if (DistributionOK(NormalBuckets)) {
      continue;
    }

    // second choice: random - the proto dimension is rewritten in place
    if (RandomBuckets == nullptr) {
      RandomBuckets = GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence);
    }
    MakeDimRandom(i, Proto, &(Clusterer->ParamDesc[i]));
    FillBuckets(RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],
                Proto->Variance.Elliptical[i]);
    if (DistributionOK(RandomBuckets)) {
      continue;
    }

    // last choice: uniform - again rewriting the proto dimension in place
    if (UniformBuckets == nullptr) {
      UniformBuckets = GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence);
    }
    MakeDimUniform(i, Proto, Statistics);
    FillBuckets(UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],
                Proto->Variance.Elliptical[i]);
    if (DistributionOK(UniformBuckets)) {
      continue;
    }
    // no distribution fits this dimension: give up on the whole proto
    break;
  }
  // if any dimension failed to match a distribution, discard the proto
  if (i < Clusterer->SampleSize) {
    FreePrototype(Proto);
    Proto = nullptr;
  }
  return (Proto);
} // MakeMixedProto
2304
2305 /**
2306 * This routine alters the ith dimension of the specified
2307 * mixed prototype to be D_random.
2308 * @param i index of dimension to be changed
2309 * @param Proto prototype whose dimension is to be altered
2310 * @param ParamDesc description of specified dimension
2311 */
static void MakeDimRandom(uint16_t i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) {
  // Re-model dimension i as uniformly random over the full range:
  // mean at mid-range, "variance" slot holds the half-range spread.
  Proto->Distrib[i] = D_random;
  Proto->Mean[i] = ParamDesc->MidRange;
  Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;

  // subtract out the previous magnitude of this dimension from the total
  // (TotalMagnitude is the product over dimensions, so divide out the
  // old factor, install the new one, then multiply it back in)
  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
  Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
  Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));

  // note that the proto Weight is irrelevant for D_random protos
} // MakeDimRandom
2325
2326 /**
2327 * This routine alters the ith dimension of the specified
2328 * mixed prototype to be uniform.
2329 * @param i index of dimension to be changed
2330 * @param Proto prototype whose dimension is to be altered
2331 * @param Statistics statistical info about prototype
2332 */
static void MakeDimUniform(uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics) {
  // Re-model dimension i as uniform over the observed sample range.
  // Statistics->Min/Max are offsets from the cluster mean, so the new
  // mean is the cluster mean shifted to the midpoint of that range.
  Proto->Distrib[i] = uniform;
  Proto->Mean[i] = Proto->Cluster->Mean[i] + (Statistics->Min[i] + Statistics->Max[i]) / 2;
  // The "variance" slot holds the half-width of the observed range,
  // clamped away from zero.
  Proto->Variance.Elliptical[i] = (Statistics->Max[i] - Statistics->Min[i]) / 2;
  if (Proto->Variance.Elliptical[i] < MINVARIANCE) {
    Proto->Variance.Elliptical[i] = MINVARIANCE;
  }

  // subtract out the previous magnitude of this dimension from the total
  // (TotalMagnitude is a product over dimensions: divide out the old
  // factor, install the new one, multiply it back in)
  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
  Proto->Magnitude.Elliptical[i] = 1.0 / (2.0 * Proto->Variance.Elliptical[i]);
  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
  Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));

  // note that the proto Weight is irrelevant for uniform protos
} // MakeDimUniform
2349
2350 /**
2351 * This routine searches the cluster tree for all leaf nodes
2352 * which are samples in the specified cluster. It computes
2353 * a full covariance matrix for these samples as well as
2354 * keeping track of the ranges (min and max) for each
2355 * dimension. A special data structure is allocated to
2356 * return this information to the caller. An incremental
2357 * algorithm for computing statistics is not used because
2358 * it will not work with circular dimensions.
2359 * @param N number of dimensions
2360 * @param ParamDesc array of dimension descriptions
2361 * @param Cluster cluster whose stats are to be computed
2362 * @return Pointer to new data structure containing statistics
2363 */
static STATISTICS *ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster) {
  int i, j;
  LIST SearchState;
  SAMPLE *Sample;
  uint32_t SampleCountAdjustedForBias;

  // allocate memory to hold the statistics results
  auto Statistics = new STATISTICS(N);

  // allocate temporary memory to hold the sample to mean distances
  std::vector<float> Distance(N);

  // find each sample in the cluster and merge it into the statistics
  InitSampleSearch(SearchState, Cluster);
  while ((Sample = NextSample(&SearchState)) != nullptr) {
    for (i = 0; i < N; i++) {
      // distance of this sample from the cluster mean; circular
      // dimensions are wrapped into [-HalfRange, HalfRange]
      Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
      if (ParamDesc[i].Circular) {
        if (Distance[i] > ParamDesc[i].HalfRange) {
          Distance[i] -= ParamDesc[i].Range;
        }
        if (Distance[i] < -ParamDesc[i].HalfRange) {
          Distance[i] += ParamDesc[i].Range;
        }
      }
      // track per-dimension min/max offsets from the mean
      if (Distance[i] < Statistics->Min[i]) {
        Statistics->Min[i] = Distance[i];
      }
      if (Distance[i] > Statistics->Max[i]) {
        Statistics->Max[i] = Distance[i];
      }
    }
    // accumulate the outer product Distance * Distance^T into the
    // row-major N x N covariance matrix
    auto CoVariance = &Statistics->CoVariance[0];
    for (i = 0; i < N; i++) {
      for (j = 0; j < N; j++, CoVariance++) {
        *CoVariance += Distance[i] * Distance[j];
      }
    }
  }
  // normalize the variances by the total number of samples
  // use SampleCount-1 instead of SampleCount to get an unbiased estimate
  // also compute the geometic mean of the diagonal variances
  // ensure that clusters with only 1 sample are handled correctly
  if (Cluster->SampleCount > 1) {
    SampleCountAdjustedForBias = Cluster->SampleCount - 1;
  } else {
    SampleCountAdjustedForBias = 1;
  }
  auto CoVariance = &Statistics->CoVariance[0];
  for (i = 0; i < N; i++) {
    for (j = 0; j < N; j++, CoVariance++) {
      *CoVariance /= SampleCountAdjustedForBias;
      if (j == i) {
        // clamp diagonal variances away from zero so later divisions
        // and the geometric mean stay well-defined
        if (*CoVariance < MINVARIANCE) {
          *CoVariance = MINVARIANCE;
        }
        Statistics->AvgVariance *= *CoVariance;
      }
    }
  }
  // AvgVariance now holds the product of the N diagonal variances;
  // the Nth root turns it into their geometric mean
  Statistics->AvgVariance =
      static_cast<float>(pow(static_cast<double>(Statistics->AvgVariance), 1.0 / N));

  return Statistics;
} // ComputeStatistics
2429
2430 /**
2431 * This routine creates a spherical prototype data structure to
2432 * approximate the samples in the specified cluster.
2433 * Spherical prototypes have a single variance which is
2434 * common across all dimensions. All dimensions are normally
2435 * distributed and independent.
2436 * @param N number of dimensions
2437 * @param Cluster cluster to be made into a spherical prototype
2438 * @param Statistics statistical info about samples in cluster
2439 * @return Pointer to a new spherical prototype data structure
2440 */
NewSphericalProto(uint16_t N,CLUSTER * Cluster,STATISTICS * Statistics)2441 static PROTOTYPE *NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
2442 PROTOTYPE *Proto;
2443
2444 Proto = NewSimpleProto(N, Cluster);
2445
2446 Proto->Variance.Spherical = Statistics->AvgVariance;
2447 if (Proto->Variance.Spherical < MINVARIANCE) {
2448 Proto->Variance.Spherical = MINVARIANCE;
2449 }
2450
2451 Proto->Magnitude.Spherical = 1.0 / sqrt(2.0 * M_PI * Proto->Variance.Spherical);
2452 Proto->TotalMagnitude = static_cast<float>(
2453 pow(static_cast<double>(Proto->Magnitude.Spherical), static_cast<double>(N)));
2454 Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
2455 Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
2456
2457 return (Proto);
2458 } // NewSphericalProto
2459
2460 /**
2461 * This routine creates an elliptical prototype data structure to
2462 * approximate the samples in the specified cluster.
2463 * Elliptical prototypes have a variance for each dimension.
2464 * All dimensions are normally distributed and independent.
2465 * @param N number of dimensions
2466 * @param Cluster cluster to be made into an elliptical prototype
2467 * @param Statistics statistical info about samples in cluster
2468 * @return Pointer to a new elliptical prototype data structure
2469 */
NewEllipticalProto(int16_t N,CLUSTER * Cluster,STATISTICS * Statistics)2470 static PROTOTYPE *NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
2471 PROTOTYPE *Proto;
2472 int i;
2473
2474 Proto = NewSimpleProto(N, Cluster);
2475 Proto->Variance.Elliptical = new float[N];
2476 Proto->Magnitude.Elliptical = new float[N];
2477 Proto->Weight.Elliptical = new float[N];
2478
2479 auto CoVariance = &Statistics->CoVariance[0];
2480 Proto->TotalMagnitude = 1.0;
2481 for (i = 0; i < N; i++, CoVariance += N + 1) {
2482 Proto->Variance.Elliptical[i] = *CoVariance;
2483 if (Proto->Variance.Elliptical[i] < MINVARIANCE) {
2484 Proto->Variance.Elliptical[i] = MINVARIANCE;
2485 }
2486
2487 Proto->Magnitude.Elliptical[i] = 1.0f / sqrt(2.0f * M_PI * Proto->Variance.Elliptical[i]);
2488 Proto->Weight.Elliptical[i] = 1.0f / Proto->Variance.Elliptical[i];
2489 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
2490 }
2491 Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
2492 Proto->Style = elliptical;
2493 return (Proto);
2494 } // NewEllipticalProto
2495
2496 /**
2497 * This routine creates a mixed prototype data structure to
2498 * approximate the samples in the specified cluster.
2499 * Mixed prototypes can have different distributions for
2500 * each dimension. All dimensions are independent. The
2501 * structure is initially filled in as though it were an
2502 * elliptical prototype. The actual distributions of the
2503 * dimensions can be altered by other routines.
2504 * @param N number of dimensions
2505 * @param Cluster cluster to be made into a mixed prototype
2506 * @param Statistics statistical info about samples in cluster
2507 * @return Pointer to a new mixed prototype data structure
2508 */
NewMixedProto(int16_t N,CLUSTER * Cluster,STATISTICS * Statistics)2509 static PROTOTYPE *NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
2510 auto Proto = NewEllipticalProto(N, Cluster, Statistics);
2511 Proto->Distrib.clear();
2512 Proto->Distrib.resize(N, normal);
2513 Proto->Style = mixed;
2514 return Proto;
2515 } // NewMixedProto
2516
2517 /**
2518 * This routine allocates memory to hold a simple prototype
2519 * data structure, i.e. one without independent distributions
2520 * and variances for each dimension.
2521 * @param N number of dimensions
2522 * @param Cluster cluster to be made into a prototype
2523 * @return Pointer to new simple prototype
2524 */
NewSimpleProto(int16_t N,CLUSTER * Cluster)2525 static PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster) {
2526 auto Proto = new PROTOTYPE;
2527 ASSERT_HOST(N == sizeof(Cluster->Mean));
2528 Proto->Mean = Cluster->Mean;
2529 Proto->Distrib.clear();
2530 Proto->Significant = true;
2531 Proto->Merged = false;
2532 Proto->Style = spherical;
2533 Proto->NumSamples = Cluster->SampleCount;
2534 Proto->Cluster = Cluster;
2535 Proto->Cluster->Prototype = true;
2536 return Proto;
2537 } // NewSimpleProto
2538
2539 /**
2540 * This routine returns true if the specified covariance
2541 * matrix indicates that all N dimensions are independent of
2542 * one another. One dimension is judged to be independent of
2543 * another when the magnitude of the corresponding correlation
2544 * coefficient is
2545 * less than the specified Independence factor. The
2546 * correlation coefficient is calculated as: (see Duda and
2547 * Hart, pg. 247)
2548 * coeff[ij] = stddev[ij] / sqrt (stddev[ii] * stddev[jj])
2549 * The covariance matrix is assumed to be symmetric (which
2550 * should always be true).
2551 * @param ParamDesc descriptions of each feature space dimension
2552 * @param N number of dimensions
2553 * @param CoVariance ptr to a covariance matrix
2554 * @param Independence max off-diagonal correlation coefficient
2555 * @return true if dimensions are independent, false otherwise
2556 */
static bool Independent(PARAM_DESC *ParamDesc, int16_t N, float *CoVariance, float Independence) {
  int i, j;
  float *VARii; // points to ith on-diagonal element
  float *VARjj; // points to jth on-diagonal element
  float CorrelationCoeff;

  // VARii walks the diagonal of the row-major N x N covariance matrix
  VARii = CoVariance;
  for (i = 0; i < N; i++, VARii += N + 1) {
    if (ParamDesc[i].NonEssential) {
      continue;
    }

    // VARjj walks the diagonal below row i; CoVariance is re-pointed at
    // the off-diagonal elements of row i to the right of the diagonal
    // (only the upper triangle is examined - matrix assumed symmetric)
    VARjj = VARii + N + 1;
    CoVariance = VARii + 1;
    for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
      if (ParamDesc[j].NonEssential) {
        continue;
      }

      // a zero variance in either dimension makes the coefficient
      // undefined; treat the pair as uncorrelated
      if ((*VARii == 0.0) || (*VARjj == 0.0)) {
        CorrelationCoeff = 0.0;
      } else {
        // NOTE(review): this computes sqrt(sqrt(coeff^2)), i.e. the
        // square root of |coeff|, not |coeff| as the header comment
        // implies. The Independence threshold is presumably tuned to
        // this scale -- confirm before removing the extra sqrt.
        CorrelationCoeff = sqrt(std::sqrt(*CoVariance * *CoVariance / (*VARii * *VARjj)));
      }
      // any essential pair correlated beyond the threshold means the
      // dimensions are not independent
      if (CorrelationCoeff > Independence) {
        return false;
      }
    }
  }
  return true;
} // Independent
2588
2589 /**
2590 * This routine returns a histogram data structure which can
2591 * be used by other routines to place samples into histogram
2592 * buckets, and then apply a goodness of fit test to the
2593 * histogram data to determine if the samples belong to the
2594 * specified probability distribution. The routine keeps
2595 * a list of bucket data structures which have already been
2596 * created so that it minimizes the computation time needed
2597 * to create a new bucket.
2598 * @param clusterer which keeps a bucket_cache for us.
2599 * @param Distribution type of probability distribution to test for
2600 * @param SampleCount number of samples that are available
2601 * @param Confidence probability of a Type I error
2602 * @return Bucket data structure
2603 */
GetBuckets(CLUSTERER * clusterer,DISTRIBUTION Distribution,uint32_t SampleCount,double Confidence)2604 static BUCKETS *GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uint32_t SampleCount,
2605 double Confidence) {
2606 // Get an old bucket structure with the same number of buckets.
2607 uint16_t NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
2608 BUCKETS *Buckets = clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS];
2609
2610 // If a matching bucket structure is not found, make one and save it.
2611 if (Buckets == nullptr) {
2612 Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
2613 clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] = Buckets;
2614 } else {
2615 // Just adjust the existing buckets.
2616 if (SampleCount != Buckets->SampleCount) {
2617 AdjustBuckets(Buckets, SampleCount);
2618 }
2619 if (Confidence != Buckets->Confidence) {
2620 Buckets->Confidence = Confidence;
2621 Buckets->ChiSquared =
2622 ComputeChiSquared(DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);
2623 }
2624 InitBuckets(Buckets);
2625 }
2626 return Buckets;
2627 } // GetBuckets
2628
2629 /**
2630 * This routine creates a histogram data structure which can
2631 * be used by other routines to place samples into histogram
2632 * buckets, and then apply a goodness of fit test to the
2633 * histogram data to determine if the samples belong to the
2634 * specified probability distribution. The buckets are
2635 * allocated in such a way that the expected frequency of
2636 * samples in each bucket is approximately the same. In
2637 * order to make this possible, a mapping table is
2638 * computed which maps "normalized" samples into the
2639 * appropriate bucket.
2640 * @param Distribution type of probability distribution to test for
2641 * @param SampleCount number of samples that are available
2642 * @param Confidence probability of a Type I error
2643 * @return Pointer to new histogram data structure
2644 */
static BUCKETS *MakeBuckets(DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence) {
  // density functions indexed by DISTRIBUTION; uniform and random share one
  const DENSITYFUNC DensityFunction[] = {NormalDensity, UniformDensity, UniformDensity};
  int i, j;
  double BucketProbability;
  double NextBucketBoundary;
  double Probability;
  double ProbabilityDelta;
  double LastProbDensity;
  double ProbDensity;
  uint16_t CurrentBucket;
  bool Symmetrical;

  // allocate memory needed for data structure
  auto Buckets = new BUCKETS(OptimumNumberOfBuckets(SampleCount));
  Buckets->SampleCount = SampleCount;
  Buckets->Confidence = Confidence;

  // initialize simple fields
  Buckets->Distribution = Distribution;

  // all currently defined distributions are symmetrical
  Symmetrical = true;
  // threshold that the chi-squared statistic will be tested against
  Buckets->ChiSquared =
      ComputeChiSquared(DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);

  if (Symmetrical) {
    // allocate buckets so that all have approx. equal probability
    BucketProbability = 1.0 / static_cast<double>(Buckets->NumberOfBuckets);

    // distribution is symmetric so fill in upper half then copy;
    // with an odd bucket count the middle bucket straddles the center,
    // so only half its probability lies in the upper half
    CurrentBucket = Buckets->NumberOfBuckets / 2;
    if (Odd(Buckets->NumberOfBuckets)) {
      NextBucketBoundary = BucketProbability / 2;
    } else {
      NextBucketBoundary = BucketProbability;
    }

    // integrate the density over the upper half of the table with the
    // trapezoidal rule, moving to the next bucket whenever the running
    // probability crosses the current bucket boundary
    Probability = 0.0;
    LastProbDensity = (*DensityFunction[static_cast<int>(Distribution)])(BUCKETTABLESIZE / 2);
    for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
      ProbDensity = (*DensityFunction[static_cast<int>(Distribution)])(i + 1);
      ProbabilityDelta = Integral(LastProbDensity, ProbDensity, 1.0);
      Probability += ProbabilityDelta;
      if (Probability > NextBucketBoundary) {
        // never advance past the last bucket
        if (CurrentBucket < Buckets->NumberOfBuckets - 1) {
          CurrentBucket++;
        }
        NextBucketBoundary += BucketProbability;
      }
      Buckets->Bucket[i] = CurrentBucket;
      Buckets->ExpectedCount[CurrentBucket] += static_cast<float>(ProbabilityDelta * SampleCount);
      LastProbDensity = ProbDensity;
    }
    // place any leftover probability into the last bucket
    Buckets->ExpectedCount[CurrentBucket] += static_cast<float>((0.5 - Probability) * SampleCount);

    // copy upper half of distribution to lower half
    for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--) {
      Buckets->Bucket[i] = Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);
    }

    // copy upper half of expected counts to lower half
    for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--) {
      Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
    }
  }
  return Buckets;
} // MakeBuckets
2713
2714 /**
2715 * This routine computes the optimum number of histogram
2716 * buckets that should be used in a chi-squared goodness of
2717 * fit test for the specified number of samples. The optimum
2718 * number is computed based on Table 4.1 on pg. 147 of
2719 * "Measurement and Analysis of Random Data" by Bendat & Piersol.
2720 * Linear interpolation is used to interpolate between table
2721 * values. The table is intended for a 0.05 level of
2722 * significance (alpha). This routine assumes that it is
2723 * equally valid for other alpha's, which may not be true.
2724 * @param SampleCount number of samples to be tested
2725 * @return Optimum number of histogram buckets
2726 */
OptimumNumberOfBuckets(uint32_t SampleCount)2727 static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount) {
2728 uint8_t Last, Next;
2729 float Slope;
2730
2731 if (SampleCount < kCountTable[0]) {
2732 return kBucketsTable[0];
2733 }
2734
2735 for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {
2736 if (SampleCount <= kCountTable[Next]) {
2737 Slope = static_cast<float>(kBucketsTable[Next] - kBucketsTable[Last]) /
2738 static_cast<float>(kCountTable[Next] - kCountTable[Last]);
2739 return (
2740 static_cast<uint16_t>(kBucketsTable[Last] + Slope * (SampleCount - kCountTable[Last])));
2741 }
2742 }
2743 return kBucketsTable[Last];
2744 } // OptimumNumberOfBuckets
2745
2746 /**
2747 * This routine computes the chi-squared value which will
2748 * leave a cumulative probability of Alpha in the right tail
2749 * of a chi-squared distribution with the specified number of
2750 * degrees of freedom. Alpha must be between 0 and 1.
2751 * DegreesOfFreedom must be even. The routine maintains an
2752 * array of lists. Each list corresponds to a different
2753 * number of degrees of freedom. Each entry in the list
2754 * corresponds to a different alpha value and its corresponding
2755 * chi-squared value. Therefore, once a particular chi-squared
2756 * value is computed, it is stored in the list and never
2757 * needs to be computed again.
2758 * @param DegreesOfFreedom determines shape of distribution
2759 * @param Alpha probability of right tail
2760 * @return Desired chi-squared value
2761 */
static double ComputeChiSquared(uint16_t DegreesOfFreedom, double Alpha)
#define CHIACCURACY 0.01
#define MINALPHA (1e-200)
{
  // cache of previously solved values: one list per (even) number of
  // degrees of freedom, each entry pairing an alpha with its chi-squared
  static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];

  // limit the minimum alpha that can be used - if alpha is too small
  // it may not be possible to compute chi-squared.
  Alpha = ClipToRange(Alpha, MINALPHA, 1.0);
  // the series evaluated by ChiArea only works for even degrees of freedom
  if (Odd(DegreesOfFreedom)) {
    DegreesOfFreedom++;
  }

  /* find the list of chi-squared values which have already been computed
     for the specified number of degrees of freedom. Search the list for
     the desired chi-squared. */
  CHISTRUCT SearchKey(0.0, Alpha);
  auto OldChiSquared = reinterpret_cast<CHISTRUCT *>(
      search(ChiWith[DegreesOfFreedom], &SearchKey, AlphaMatch)->first_node());

  if (OldChiSquared == nullptr) {
    // not cached yet: numerically solve ChiArea for the chi-squared
    // value whose right-tail area equals Alpha, then cache the result
    OldChiSquared = new CHISTRUCT(DegreesOfFreedom, Alpha);
    OldChiSquared->ChiSquared =
        Solve(ChiArea, OldChiSquared, static_cast<double>(DegreesOfFreedom), CHIACCURACY);
    ChiWith[DegreesOfFreedom] = push(ChiWith[DegreesOfFreedom], OldChiSquared);
  } else {
    // further optimization might move OldChiSquared to front of list
  }

  return (OldChiSquared->ChiSquared);

} // ComputeChiSquared
2794
2795 /**
2796 * This routine computes the probability density function
2797 * of a discrete normal distribution defined by the global
2798 * variables kNormalMean, kNormalVariance, and kNormalMagnitude.
2799 * Normal magnitude could, of course, be computed in terms of
2800 * the normal variance but it is precomputed for efficiency.
2801 * @param x number to compute the normal probability density for
2802 * @note Globals:
2803 * kNormalMean mean of a discrete normal distribution
2804 * kNormalVariance variance of a discrete normal distribution
2805 * kNormalMagnitude magnitude of a discrete normal distribution
2806 * @return The value of the normal distribution at x.
2807 */
NormalDensity(int32_t x)2808 static double NormalDensity(int32_t x) {
2809 double Distance;
2810
2811 Distance = x - kNormalMean;
2812 return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);
2813 } // NormalDensity
2814
2815 /**
2816 * This routine computes the probability density function
2817 * of a uniform distribution at the specified point. The
2818 * range of the distribution is from 0 to BUCKETTABLESIZE.
2819 * @param x number to compute the uniform probability density for
2820 * @return The value of the uniform distribution at x.
2821 */
UniformDensity(int32_t x)2822 static double UniformDensity(int32_t x) {
2823 constexpr auto UniformDistributionDensity = 1.0 / BUCKETTABLESIZE;
2824
2825 if ((x >= 0) && (x <= BUCKETTABLESIZE)) {
2826 return UniformDistributionDensity;
2827 } else {
2828 return 0.0;
2829 }
2830 } // UniformDensity
2831
2832 /**
2833 * This routine computes a trapezoidal approximation to the
2834 * integral of a function over a small delta in x.
2835 * @param f1 value of function at x1
2836 * @param f2 value of function at x2
2837 * @param Dx x2 - x1 (should always be positive)
2838 * @return Approximation of the integral of the function from x1 to x2.
2839 */
static double Integral(double f1, double f2, double Dx) {
  // Trapezoidal rule: area is the mean of the two heights times the width.
  return 0.5 * (f1 + f2) * Dx;
} // Integral
2843
2844 /**
2845 * This routine counts the number of cluster samples which
2846 * fall within the various histogram buckets in Buckets. Only
2847 * one dimension of each sample is examined. The exact meaning
2848 * of the Mean and StdDev parameters depends on the
2849 * distribution which is being analyzed (this info is in the
2850 * Buckets data structure). For normal distributions, Mean
2851 * and StdDev have the expected meanings. For uniform and
2852 * random distributions the Mean is the center point of the
2853 * range and the StdDev is 1/2 the range. A dimension with
2854 * zero standard deviation cannot be statistically analyzed.
2855 * In this case, a pseudo-analysis is used.
2856 * The Buckets data structure is filled in.
2857 * @param Buckets histogram buckets to count samples
2858 * @param Cluster cluster whose samples are being analyzed
2859 * @param Dim dimension of samples which is being analyzed
2860 * @param ParamDesc description of the dimension
2861 * @param Mean "mean" of the distribution
2862 * @param StdDev "standard deviation" of the distribution
2863 */
FillBuckets(BUCKETS * Buckets,CLUSTER * Cluster,uint16_t Dim,PARAM_DESC * ParamDesc,float Mean,float StdDev)2864 static void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc,
2865 float Mean, float StdDev) {
2866 uint16_t BucketID;
2867 int i;
2868 LIST SearchState;
2869 SAMPLE *Sample;
2870
2871 // initialize the histogram bucket counts to 0
2872 for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2873 Buckets->Count[i] = 0;
2874 }
2875
2876 if (StdDev == 0.0) {
2877 /* if the standard deviation is zero, then we can't statistically
2878 analyze the cluster. Use a pseudo-analysis: samples exactly on
2879 the mean are distributed evenly across all buckets. Samples greater
2880 than the mean are placed in the last bucket; samples less than the
2881 mean are placed in the first bucket. */
2882
2883 InitSampleSearch(SearchState, Cluster);
2884 i = 0;
2885 while ((Sample = NextSample(&SearchState)) != nullptr) {
2886 if (Sample->Mean[Dim] > Mean) {
2887 BucketID = Buckets->NumberOfBuckets - 1;
2888 } else if (Sample->Mean[Dim] < Mean) {
2889 BucketID = 0;
2890 } else {
2891 BucketID = i;
2892 }
2893 Buckets->Count[BucketID] += 1;
2894 i++;
2895 if (i >= Buckets->NumberOfBuckets) {
2896 i = 0;
2897 }
2898 }
2899 } else {
2900 // search for all samples in the cluster and add to histogram buckets
2901 InitSampleSearch(SearchState, Cluster);
2902 while ((Sample = NextSample(&SearchState)) != nullptr) {
2903 switch (Buckets->Distribution) {
2904 case normal:
2905 BucketID = NormalBucket(ParamDesc, Sample->Mean[Dim], Mean, StdDev);
2906 break;
2907 case D_random:
2908 case uniform:
2909 BucketID = UniformBucket(ParamDesc, Sample->Mean[Dim], Mean, StdDev);
2910 break;
2911 default:
2912 BucketID = 0;
2913 }
2914 Buckets->Count[Buckets->Bucket[BucketID]] += 1;
2915 }
2916 }
2917 } // FillBuckets
2918
2919 /**
2920 * This routine determines which bucket x falls into in the
2921 * discrete normal distribution defined by kNormalMean
2922 * and kNormalStdDev. x values which exceed the range of
2923 * the discrete distribution are clipped.
2924 * @param ParamDesc used to identify circular dimensions
2925 * @param x value to be normalized
2926 * @param Mean mean of normal distribution
2927 * @param StdDev standard deviation of normal distribution
2928 * @return Bucket number into which x falls
2929 */
NormalBucket(PARAM_DESC * ParamDesc,float x,float Mean,float StdDev)2930 static uint16_t NormalBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev) {
2931 float X;
2932
2933 // wraparound circular parameters if necessary
2934 if (ParamDesc->Circular) {
2935 if (x - Mean > ParamDesc->HalfRange) {
2936 x -= ParamDesc->Range;
2937 } else if (x - Mean < -ParamDesc->HalfRange) {
2938 x += ParamDesc->Range;
2939 }
2940 }
2941
2942 X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean;
2943 if (X < 0) {
2944 return 0;
2945 }
2946 if (X > BUCKETTABLESIZE - 1) {
2947 return (static_cast<uint16_t>(BUCKETTABLESIZE - 1));
2948 }
2949 return static_cast<uint16_t>(floor(static_cast<double>(X)));
2950 } // NormalBucket
2951
2952 /**
2953 * This routine determines which bucket x falls into in the
2954 * discrete uniform distribution defined by
2955 * BUCKETTABLESIZE. x values which exceed the range of
2956 * the discrete distribution are clipped.
2957 * @param ParamDesc used to identify circular dimensions
2958 * @param x value to be normalized
2959 * @param Mean center of range of uniform distribution
2960 * @param StdDev 1/2 the range of the uniform distribution
2961 * @return Bucket number into which x falls
2962 */
UniformBucket(PARAM_DESC * ParamDesc,float x,float Mean,float StdDev)2963 static uint16_t UniformBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev) {
2964 float X;
2965
2966 // wraparound circular parameters if necessary
2967 if (ParamDesc->Circular) {
2968 if (x - Mean > ParamDesc->HalfRange) {
2969 x -= ParamDesc->Range;
2970 } else if (x - Mean < -ParamDesc->HalfRange) {
2971 x += ParamDesc->Range;
2972 }
2973 }
2974
2975 X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0);
2976 if (X < 0) {
2977 return 0;
2978 }
2979 if (X > BUCKETTABLESIZE - 1) {
2980 return static_cast<uint16_t>(BUCKETTABLESIZE - 1);
2981 }
2982 return static_cast<uint16_t>(floor(static_cast<double>(X)));
2983 } // UniformBucket
2984
2985 /**
2986 * This routine performs a chi-square goodness of fit test
2987 * on the histogram data in the Buckets data structure.
2988 * true is returned if the histogram matches the probability
2989 * distribution which was specified when the Buckets
2990 * structure was originally created. Otherwise false is
2991 * returned.
2992 * @param Buckets histogram data to perform chi-square test on
2993 * @return true if samples match distribution, false otherwise
2994 */
DistributionOK(BUCKETS * Buckets)2995 static bool DistributionOK(BUCKETS *Buckets) {
2996 float FrequencyDifference;
2997 float TotalDifference;
2998 int i;
2999
3000 // compute how well the histogram matches the expected histogram
3001 TotalDifference = 0.0;
3002 for (i = 0; i < Buckets->NumberOfBuckets; i++) {
3003 FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
3004 TotalDifference += (FrequencyDifference * FrequencyDifference) / Buckets->ExpectedCount[i];
3005 }
3006
3007 // test to see if the difference is more than expected
3008 if (TotalDifference > Buckets->ChiSquared) {
3009 return false;
3010 } else {
3011 return true;
3012 }
3013 } // DistributionOK
3014
3015 /**
3016 * This routine computes the degrees of freedom that should
3017 * be used in a chi-squared test with the specified number of
3018 * histogram buckets. The result is always rounded up to
3019 * the next even number so that the value of chi-squared can be
3020 * computed more easily. This will cause the value of
3021 * chi-squared to be higher than the optimum value, resulting
3022 * in the chi-square test being more lenient than optimum.
3023 * @param Distribution distribution being tested for
3024 * @param HistogramBuckets number of buckets in chi-square test
3025 * @return The number of degrees of freedom for a chi-square test
3026 */
DegreesOfFreedom(DISTRIBUTION Distribution,uint16_t HistogramBuckets)3027 static uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets) {
3028 static uint8_t DegreeOffsets[] = {3, 3, 1};
3029
3030 uint16_t AdjustedNumBuckets;
3031
3032 AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[static_cast<int>(Distribution)];
3033 if (Odd(AdjustedNumBuckets)) {
3034 AdjustedNumBuckets++;
3035 }
3036 return (AdjustedNumBuckets);
3037
3038 } // DegreesOfFreedom
3039
3040 /**
3041 * This routine multiplies each ExpectedCount histogram entry
3042 * by NewSampleCount/OldSampleCount so that the histogram
3043 * is now adjusted to the new sample count.
3044 * @param Buckets histogram data structure to adjust
3045 * @param NewSampleCount new sample count to adjust to
3046 */
AdjustBuckets(BUCKETS * Buckets,uint32_t NewSampleCount)3047 static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount) {
3048 int i;
3049 double AdjustFactor;
3050
3051 AdjustFactor =
3052 ((static_cast<double>(NewSampleCount)) / (static_cast<double>(Buckets->SampleCount)));
3053
3054 for (i = 0; i < Buckets->NumberOfBuckets; i++) {
3055 Buckets->ExpectedCount[i] *= AdjustFactor;
3056 }
3057
3058 Buckets->SampleCount = NewSampleCount;
3059
3060 } // AdjustBuckets
3061
3062 /**
3063 * This routine sets the bucket counts in the specified histogram
3064 * to zero.
3065 * @param Buckets histogram data structure to init
3066 */
InitBuckets(BUCKETS * Buckets)3067 static void InitBuckets(BUCKETS *Buckets) {
3068 int i;
3069
3070 for (i = 0; i < Buckets->NumberOfBuckets; i++) {
3071 Buckets->Count[i] = 0;
3072 }
3073
3074 } // InitBuckets
3075
3076 /**
3077 * This routine is used to search a list of structures which
3078 * hold pre-computed chi-squared values for a chi-squared
3079 * value whose corresponding alpha field matches the alpha
3080 * field of SearchKey.
3081 *
3082 * It is called by the list search routines.
3083 *
3084 * @param arg1 chi-squared struct being tested for a match
3085 * @param arg2 chi-squared struct that is the search key
3086 * @return true if ChiStruct's Alpha matches SearchKey's Alpha
3087 */
AlphaMatch(void * arg1,void * arg2)3088 static int AlphaMatch(void *arg1, // CHISTRUCT *ChiStruct,
3089 void *arg2) { // CHISTRUCT *SearchKey)
3090 auto *ChiStruct = static_cast<CHISTRUCT *>(arg1);
3091 auto *SearchKey = static_cast<CHISTRUCT *>(arg2);
3092
3093 return (ChiStruct->Alpha == SearchKey->Alpha);
3094
3095 } // AlphaMatch
3096
3097 /**
3098 * This routine attempts to find an x value at which Function
3099 * goes to zero (i.e. a root of the function). It will only
3100 * work correctly if a solution actually exists and there
3101 * are no extrema between the solution and the InitialGuess.
3102 * The algorithms used are extremely primitive.
3103 *
3104 * @param Function function whose zero is to be found
3105 * @param FunctionParams arbitrary data to pass to function
3106 * @param InitialGuess point to start solution search at
3107 * @param Accuracy maximum allowed error
3108 * @return Solution of function (x for which f(x) = 0).
3109 */
static double Solve(SOLVEFUNC Function, void *FunctionParams, double InitialGuess, double Accuracy)
#define INITIALDELTA 0.1
#define DELTARATIO 0.1
{
  double x;
  double f;
  double Slope;
  double Delta;
  double NewDelta;
  double xDelta;
  double LastPosX, LastNegX; // tightest positive-f / negative-f guesses seen

  // Newton-style iteration with a numerically estimated slope; stop
  // when the bracket between the last positive-f and last negative-f
  // guesses is narrower than Accuracy
  x = InitialGuess;
  Delta = INITIALDELTA;
  LastPosX = FLT_MAX;
  LastNegX = -FLT_MAX;
  f = (*Function)(static_cast<CHISTRUCT *>(FunctionParams), x);
  while (Abs(LastPosX - LastNegX) > Accuracy) {
    // keep track of outer bounds of current estimate
    if (f < 0) {
      LastNegX = x;
    } else {
      LastPosX = x;
    }

    // compute the approx. slope of f(x) at the current point
    // (forward difference with step Delta)
    Slope = ((*Function)(static_cast<CHISTRUCT *>(FunctionParams), x + Delta) - f) / Delta;

    // compute the next solution guess */
    xDelta = f / Slope;
    x -= xDelta;

    // reduce the delta used for computing slope to be a fraction of
    // the amount moved to get to the new guess
    NewDelta = Abs(xDelta) * DELTARATIO;
    if (NewDelta < Delta) {
      Delta = NewDelta;
    }

    // compute the value of the function at the new guess
    f = (*Function)(static_cast<CHISTRUCT *>(FunctionParams), x);
  }
  return (x);

} // Solve
3155
3156 /**
3157 * This routine computes the area under a chi density curve
3158 * from 0 to x, minus the desired area under the curve. The
3159 * number of degrees of freedom of the chi curve is specified
3160 * in the ChiParams structure. The desired area is also
3161 * specified in the ChiParams structure as Alpha (or 1 minus
3162 * the desired area). This routine is intended to be passed
3163 * to the Solve() function to find the value of chi-squared
3164 * which will yield a desired area under the right tail of
3165 * the chi density curve. The function will only work for
3166 * even degrees of freedom. The equations are based on
3167 * integrating the chi density curve in parts to obtain
3168 * a series that can be used to compute the area under the
3169 * curve.
3170 * @param ChiParams contains degrees of freedom and alpha
3171 * @param x value of chi-squared to evaluate
3172 * @return Error between actual and desired area under the chi curve.
3173 */
ChiArea(CHISTRUCT * ChiParams,double x)3174 static double ChiArea(CHISTRUCT *ChiParams, double x) {
3175 int i, N;
3176 double SeriesTotal;
3177 double Denominator;
3178 double PowerOfx;
3179
3180 N = ChiParams->DegreesOfFreedom / 2 - 1;
3181 SeriesTotal = 1;
3182 Denominator = 1;
3183 PowerOfx = 1;
3184 for (i = 1; i <= N; i++) {
3185 Denominator *= 2 * i;
3186 PowerOfx *= x;
3187 SeriesTotal += PowerOfx / Denominator;
3188 }
3189 return ((SeriesTotal * exp(-0.5 * x)) - ChiParams->Alpha);
3190
3191 } // ChiArea
3192
3193 /**
3194 * This routine looks at all samples in the specified cluster.
3195 * It computes a running estimate of the percentage of the
3196 * characters which have more than 1 sample in the cluster.
3197 * When this percentage exceeds MaxIllegal, true is returned.
3198 * Otherwise false is returned. The CharID
3199 * fields must contain integers which identify the training
3200 * characters which were used to generate the sample. One
3201 * integer is used for each sample. The NumChar field in
3202 * the Clusterer must contain the number of characters in the
3203 * training set. All CharID fields must be between 0 and
3204 * NumChar-1. The main function of this routine is to help
3205 * identify clusters which need to be split further, i.e. if
3206 * numerous training characters have 2 or more features which are
3207 * contained in the same cluster, then the cluster should be
3208 * split.
3209 *
3210 * @param Clusterer data structure holding cluster tree
3211 * @param Cluster cluster containing samples to be tested
3212 * @param MaxIllegal max percentage of samples allowed to have
3213 * more than 1 feature in the cluster
3214 * @return true if the cluster should be split, false otherwise.
3215 */
MultipleCharSamples(CLUSTERER * Clusterer,CLUSTER * Cluster,float MaxIllegal)3216 static bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, float MaxIllegal)
3217 #define ILLEGAL_CHAR 2
3218 {
3219 static std::vector<uint8_t> CharFlags;
3220 LIST SearchState;
3221 SAMPLE *Sample;
3222 int32_t CharID;
3223 int32_t NumCharInCluster;
3224 int32_t NumIllegalInCluster;
3225 float PercentIllegal;
3226
3227 // initial estimate assumes that no illegal chars exist in the cluster
3228 NumCharInCluster = Cluster->SampleCount;
3229 NumIllegalInCluster = 0;
3230
3231 if (Clusterer->NumChar > CharFlags.size()) {
3232 CharFlags.resize(Clusterer->NumChar);
3233 }
3234
3235 for (auto &CharFlag : CharFlags) {
3236 CharFlag = false;
3237 }
3238
3239 // find each sample in the cluster and check if we have seen it before
3240 InitSampleSearch(SearchState, Cluster);
3241 while ((Sample = NextSample(&SearchState)) != nullptr) {
3242 CharID = Sample->CharID;
3243 if (CharFlags[CharID] == false) {
3244 CharFlags[CharID] = true;
3245 } else {
3246 if (CharFlags[CharID] == true) {
3247 NumIllegalInCluster++;
3248 CharFlags[CharID] = ILLEGAL_CHAR;
3249 }
3250 NumCharInCluster--;
3251 PercentIllegal = static_cast<float>(NumIllegalInCluster) / NumCharInCluster;
3252 if (PercentIllegal > MaxIllegal) {
3253 destroy(SearchState);
3254 return true;
3255 }
3256 }
3257 }
3258 return false;
3259
3260 } // MultipleCharSamples
3261
3262 /**
3263 * Compute the inverse of a matrix using LU decomposition with partial pivoting.
3264 * The return value is the sum of norms of the off-diagonal terms of the
3265 * product of a and inv. (A measure of the error.)
3266 */
static double InvertMatrix(const float *input, int size, float *inv) {
  // Flat row-major working matrices: U starts as a copy of the input,
  // L as the identity and U_inv as all zeros. (std::vector value-initializes,
  // so no explicit re-zeroing pass is needed.)
  const int n = size;
  std::vector<double> U(input, input + n * n);
  std::vector<double> L(n * n, 0.0);
  std::vector<double> U_inv(n * n, 0.0);
  for (int i = 0; i < n; ++i) {
    L[i * n + i] = 1.0;
  }

  // Forward elimination with partial pivoting: reduce U to upper-triangular
  // form, applying the identical row operations to L.
  for (int col = 0; col < n; ++col) {
    // Select the row at or below the diagonal with the largest magnitude
    // entry in this column to keep the elimination numerically stable.
    int best_row = col;
    double best_pivot = std::fabs(U[col * n + col]);
    for (int row = col + 1; row < n; ++row) {
      double candidate = std::fabs(U[row * n + col]);
      if (candidate > best_pivot) {
        best_pivot = candidate;
        best_row = row;
      }
    }
    // Exchange pivot rows in both U and L.
    if (best_row != col) {
      for (int k = 0; k < n; ++k) {
        double tmp = U[best_row * n + k];
        U[best_row * n + k] = U[col * n + k];
        U[col * n + k] = tmp;
        tmp = L[best_row * n + k];
        L[best_row * n + k] = L[col * n + k];
        L[col * n + k] = tmp;
      }
    }
    // Eliminate everything below the pivot.
    // NOTE: a singular (or size-0) input gives a zero pivot here, producing
    // inf/NaN rather than an error — same behavior as the original code.
    for (int row = col + 1; row < n; ++row) {
      double ratio = -U[row * n + col] / U[col * n + col];
      for (int j = col; j < n; ++j) {
        U[row * n + j] += U[col * n + j] * ratio;
      }
      for (int k = 0; k < n; ++k) {
        L[row * n + k] += L[col * n + k] * ratio;
      }
    }
  }

  // Invert the upper-triangular U by back-substitution, one column at a time.
  for (int col = 0; col < n; ++col) {
    U_inv[col * n + col] = 1.0 / U[col * n + col];
    for (int row = col - 1; row >= 0; --row) {
      double total = 0.0;
      for (int k = col; k > row; --k) {
        total += U[row * n + k] * U_inv[k * n + col];
      }
      U_inv[row * n + col] = -total / U[row * n + row];
    }
  }

  // The inverse of the input is U_inv * L.
  for (int row = 0; row < n; ++row) {
    for (int col = 0; col < n; ++col) {
      double sum = 0.0;
      for (int k = row; k < n; ++k) {
        sum += U_inv[row * n + k] * L[k * n + col];
      }
      inv[row * n + col] = static_cast<float>(sum);
    }
  }

  // Verify the result: sum the magnitudes of the off-diagonal entries of
  // input * inv (zero for a perfect inverse) and return that as the error.
  double error_sum = 0.0;
  for (int row = 0; row < n; ++row) {
    for (int col = 0; col < n; ++col) {
      double sum = 0.0;
      for (int k = 0; k < n; ++k) {
        sum += static_cast<double>(input[row * n + k]) * inv[k * n + col];
      }
      if (row != col) {
        error_sum += std::fabs(sum);
      }
    }
  }
  return error_sum;
}
3353
3354 } // namespace tesseract
3355