1 /******************************************************************************
2  ** Filename: cluster.cpp
3  ** Purpose:  Routines for clustering points in N-D space
4  ** Author:   Dan Johnson
5  **
6  ** (c) Copyright Hewlett-Packard Company, 1988.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *****************************************************************************/
17 
18 #define _USE_MATH_DEFINES // for M_PI
19 
20 #include "cluster.h"
21 
22 #include "genericheap.h"
23 #include "kdpair.h"
24 #include "matrix.h"
25 #include "tprintf.h"
26 
27 #include "helpers.h"
28 
29 #include <cfloat> // for FLT_MAX
30 #include <cmath>  // for M_PI
31 #include <array>  // for std::array
32 #include <vector> // for std::vector
33 
34 namespace tesseract {
35 
#define HOTELLING 1  // If true use Hotelling's test to decide where to split.
#define FTABLE_X 10  // Size of FTable.
#define FTABLE_Y 100 // Size of FTable.

// Critical values approximating the cumulative F-distribution at a
// significance level of 1%.  Rows index the second (denominator) degrees of
// freedom 1..FTABLE_Y and columns the first (numerator) degrees of freedom
// 1..FTABLE_X.  NOTE(review): the df mapping matches standard F-tables;
// confirm against the lookup code before depending on it.
const double FTable[FTABLE_Y][FTABLE_X] = {
    {4052.19, 4999.52, 5403.34, 5624.62, 5763.65, 5858.97, 5928.33, 5981.10, 6022.50, 6055.85},
    {98.502, 99.000, 99.166, 99.249, 99.300, 99.333, 99.356, 99.374, 99.388, 99.399},
    {34.116, 30.816, 29.457, 28.710, 28.237, 27.911, 27.672, 27.489, 27.345, 27.229},
    {21.198, 18.000, 16.694, 15.977, 15.522, 15.207, 14.976, 14.799, 14.659, 14.546},
    {16.258, 13.274, 12.060, 11.392, 10.967, 10.672, 10.456, 10.289, 10.158, 10.051},
    {13.745, 10.925, 9.780, 9.148, 8.746, 8.466, 8.260, 8.102, 7.976, 7.874},
    {12.246, 9.547, 8.451, 7.847, 7.460, 7.191, 6.993, 6.840, 6.719, 6.620},
    {11.259, 8.649, 7.591, 7.006, 6.632, 6.371, 6.178, 6.029, 5.911, 5.814},
    {10.561, 8.022, 6.992, 6.422, 6.057, 5.802, 5.613, 5.467, 5.351, 5.257},
    {10.044, 7.559, 6.552, 5.994, 5.636, 5.386, 5.200, 5.057, 4.942, 4.849},
    {9.646, 7.206, 6.217, 5.668, 5.316, 5.069, 4.886, 4.744, 4.632, 4.539},
    {9.330, 6.927, 5.953, 5.412, 5.064, 4.821, 4.640, 4.499, 4.388, 4.296},
    {9.074, 6.701, 5.739, 5.205, 4.862, 4.620, 4.441, 4.302, 4.191, 4.100},
    {8.862, 6.515, 5.564, 5.035, 4.695, 4.456, 4.278, 4.140, 4.030, 3.939},
    {8.683, 6.359, 5.417, 4.893, 4.556, 4.318, 4.142, 4.004, 3.895, 3.805},
    {8.531, 6.226, 5.292, 4.773, 4.437, 4.202, 4.026, 3.890, 3.780, 3.691},
    {8.400, 6.112, 5.185, 4.669, 4.336, 4.102, 3.927, 3.791, 3.682, 3.593},
    {8.285, 6.013, 5.092, 4.579, 4.248, 4.015, 3.841, 3.705, 3.597, 3.508},
    {8.185, 5.926, 5.010, 4.500, 4.171, 3.939, 3.765, 3.631, 3.523, 3.434},
    {8.096, 5.849, 4.938, 4.431, 4.103, 3.871, 3.699, 3.564, 3.457, 3.368},
    {8.017, 5.780, 4.874, 4.369, 4.042, 3.812, 3.640, 3.506, 3.398, 3.310},
    {7.945, 5.719, 4.817, 4.313, 3.988, 3.758, 3.587, 3.453, 3.346, 3.258},
    {7.881, 5.664, 4.765, 4.264, 3.939, 3.710, 3.539, 3.406, 3.299, 3.211},
    {7.823, 5.614, 4.718, 4.218, 3.895, 3.667, 3.496, 3.363, 3.256, 3.168},
    {7.770, 5.568, 4.675, 4.177, 3.855, 3.627, 3.457, 3.324, 3.217, 3.129},
    {7.721, 5.526, 4.637, 4.140, 3.818, 3.591, 3.421, 3.288, 3.182, 3.094},
    {7.677, 5.488, 4.601, 4.106, 3.785, 3.558, 3.388, 3.256, 3.149, 3.062},
    {7.636, 5.453, 4.568, 4.074, 3.754, 3.528, 3.358, 3.226, 3.120, 3.032},
    {7.598, 5.420, 4.538, 4.045, 3.725, 3.499, 3.330, 3.198, 3.092, 3.005},
    {7.562, 5.390, 4.510, 4.018, 3.699, 3.473, 3.305, 3.173, 3.067, 2.979},
    {7.530, 5.362, 4.484, 3.993, 3.675, 3.449, 3.281, 3.149, 3.043, 2.955},
    {7.499, 5.336, 4.459, 3.969, 3.652, 3.427, 3.258, 3.127, 3.021, 2.934},
    {7.471, 5.312, 4.437, 3.948, 3.630, 3.406, 3.238, 3.106, 3.000, 2.913},
    {7.444, 5.289, 4.416, 3.927, 3.611, 3.386, 3.218, 3.087, 2.981, 2.894},
    {7.419, 5.268, 4.396, 3.908, 3.592, 3.368, 3.200, 3.069, 2.963, 2.876},
    {7.396, 5.248, 4.377, 3.890, 3.574, 3.351, 3.183, 3.052, 2.946, 2.859},
    {7.373, 5.229, 4.360, 3.873, 3.558, 3.334, 3.167, 3.036, 2.930, 2.843},
    {7.353, 5.211, 4.343, 3.858, 3.542, 3.319, 3.152, 3.021, 2.915, 2.828},
    {7.333, 5.194, 4.327, 3.843, 3.528, 3.305, 3.137, 3.006, 2.901, 2.814},
    {7.314, 5.179, 4.313, 3.828, 3.514, 3.291, 3.124, 2.993, 2.888, 2.801},
    {7.296, 5.163, 4.299, 3.815, 3.501, 3.278, 3.111, 2.980, 2.875, 2.788},
    {7.280, 5.149, 4.285, 3.802, 3.488, 3.266, 3.099, 2.968, 2.863, 2.776},
    {7.264, 5.136, 4.273, 3.790, 3.476, 3.254, 3.087, 2.957, 2.851, 2.764},
    {7.248, 5.123, 4.261, 3.778, 3.465, 3.243, 3.076, 2.946, 2.840, 2.754},
    {7.234, 5.110, 4.249, 3.767, 3.454, 3.232, 3.066, 2.935, 2.830, 2.743},
    {7.220, 5.099, 4.238, 3.757, 3.444, 3.222, 3.056, 2.925, 2.820, 2.733},
    {7.207, 5.087, 4.228, 3.747, 3.434, 3.213, 3.046, 2.916, 2.811, 2.724},
    {7.194, 5.077, 4.218, 3.737, 3.425, 3.204, 3.037, 2.907, 2.802, 2.715},
    {7.182, 5.066, 4.208, 3.728, 3.416, 3.195, 3.028, 2.898, 2.793, 2.706},
    {7.171, 5.057, 4.199, 3.720, 3.408, 3.186, 3.020, 2.890, 2.785, 2.698},
    {7.159, 5.047, 4.191, 3.711, 3.400, 3.178, 3.012, 2.882, 2.777, 2.690},
    {7.149, 5.038, 4.182, 3.703, 3.392, 3.171, 3.005, 2.874, 2.769, 2.683},
    {7.139, 5.030, 4.174, 3.695, 3.384, 3.163, 2.997, 2.867, 2.762, 2.675},
    {7.129, 5.021, 4.167, 3.688, 3.377, 3.156, 2.990, 2.860, 2.755, 2.668},
    {7.119, 5.013, 4.159, 3.681, 3.370, 3.149, 2.983, 2.853, 2.748, 2.662},
    {7.110, 5.006, 4.152, 3.674, 3.363, 3.143, 2.977, 2.847, 2.742, 2.655},
    {7.102, 4.998, 4.145, 3.667, 3.357, 3.136, 2.971, 2.841, 2.736, 2.649},
    {7.093, 4.991, 4.138, 3.661, 3.351, 3.130, 2.965, 2.835, 2.730, 2.643},
    {7.085, 4.984, 4.132, 3.655, 3.345, 3.124, 2.959, 2.829, 2.724, 2.637},
    {7.077, 4.977, 4.126, 3.649, 3.339, 3.119, 2.953, 2.823, 2.718, 2.632},
    {7.070, 4.971, 4.120, 3.643, 3.333, 3.113, 2.948, 2.818, 2.713, 2.626},
    {7.062, 4.965, 4.114, 3.638, 3.328, 3.108, 2.942, 2.813, 2.708, 2.621},
    {7.055, 4.959, 4.109, 3.632, 3.323, 3.103, 2.937, 2.808, 2.703, 2.616},
    {7.048, 4.953, 4.103, 3.627, 3.318, 3.098, 2.932, 2.803, 2.698, 2.611},
    {7.042, 4.947, 4.098, 3.622, 3.313, 3.093, 2.928, 2.798, 2.693, 2.607},
    {7.035, 4.942, 4.093, 3.618, 3.308, 3.088, 2.923, 2.793, 2.689, 2.602},
    {7.029, 4.937, 4.088, 3.613, 3.304, 3.084, 2.919, 2.789, 2.684, 2.598},
    {7.023, 4.932, 4.083, 3.608, 3.299, 3.080, 2.914, 2.785, 2.680, 2.593},
    {7.017, 4.927, 4.079, 3.604, 3.295, 3.075, 2.910, 2.781, 2.676, 2.589},
    {7.011, 4.922, 4.074, 3.600, 3.291, 3.071, 2.906, 2.777, 2.672, 2.585},
    {7.006, 4.917, 4.070, 3.596, 3.287, 3.067, 2.902, 2.773, 2.668, 2.581},
    {7.001, 4.913, 4.066, 3.591, 3.283, 3.063, 2.898, 2.769, 2.664, 2.578},
    {6.995, 4.908, 4.062, 3.588, 3.279, 3.060, 2.895, 2.765, 2.660, 2.574},
    {6.990, 4.904, 4.058, 3.584, 3.275, 3.056, 2.891, 2.762, 2.657, 2.570},
    {6.985, 4.900, 4.054, 3.580, 3.272, 3.052, 2.887, 2.758, 2.653, 2.567},
    {6.981, 4.896, 4.050, 3.577, 3.268, 3.049, 2.884, 2.755, 2.650, 2.563},
    {6.976, 4.892, 4.047, 3.573, 3.265, 3.046, 2.881, 2.751, 2.647, 2.560},
    {6.971, 4.888, 4.043, 3.570, 3.261, 3.042, 2.877, 2.748, 2.644, 2.557},
    {6.967, 4.884, 4.040, 3.566, 3.258, 3.039, 2.874, 2.745, 2.640, 2.554},
    {6.963, 4.881, 4.036, 3.563, 3.255, 3.036, 2.871, 2.742, 2.637, 2.551},
    {6.958, 4.877, 4.033, 3.560, 3.252, 3.033, 2.868, 2.739, 2.634, 2.548},
    {6.954, 4.874, 4.030, 3.557, 3.249, 3.030, 2.865, 2.736, 2.632, 2.545},
    {6.950, 4.870, 4.027, 3.554, 3.246, 3.027, 2.863, 2.733, 2.629, 2.542},
    {6.947, 4.867, 4.024, 3.551, 3.243, 3.025, 2.860, 2.731, 2.626, 2.539},
    {6.943, 4.864, 4.021, 3.548, 3.240, 3.022, 2.857, 2.728, 2.623, 2.537},
    {6.939, 4.861, 4.018, 3.545, 3.238, 3.019, 2.854, 2.725, 2.621, 2.534},
    {6.935, 4.858, 4.015, 3.543, 3.235, 3.017, 2.852, 2.723, 2.618, 2.532},
    {6.932, 4.855, 4.012, 3.540, 3.233, 3.014, 2.849, 2.720, 2.616, 2.529},
    {6.928, 4.852, 4.010, 3.538, 3.230, 3.012, 2.847, 2.718, 2.613, 2.527},
    {6.925, 4.849, 4.007, 3.535, 3.228, 3.009, 2.845, 2.715, 2.611, 2.524},
    {6.922, 4.846, 4.004, 3.533, 3.225, 3.007, 2.842, 2.713, 2.609, 2.522},
    {6.919, 4.844, 4.002, 3.530, 3.223, 3.004, 2.840, 2.711, 2.606, 2.520},
    {6.915, 4.841, 3.999, 3.528, 3.221, 3.002, 2.838, 2.709, 2.604, 2.518},
    {6.912, 4.838, 3.997, 3.525, 3.218, 3.000, 2.835, 2.706, 2.602, 2.515},
    {6.909, 4.836, 3.995, 3.523, 3.216, 2.998, 2.833, 2.704, 2.600, 2.513},
    {6.906, 4.833, 3.992, 3.521, 3.214, 2.996, 2.831, 2.702, 2.598, 2.511},
    {6.904, 4.831, 3.990, 3.519, 3.212, 2.994, 2.829, 2.700, 2.596, 2.509},
    {6.901, 4.829, 3.988, 3.517, 3.210, 2.992, 2.827, 2.698, 2.594, 2.507},
    {6.898, 4.826, 3.986, 3.515, 3.208, 2.990, 2.825, 2.696, 2.592, 2.505},
    {6.895, 4.824, 3.984, 3.513, 3.206, 2.988, 2.823, 2.694, 2.590, 2.503}};
1232 
1233 /** define the variance which will be used as a minimum variance for any
1234   dimension of any feature. Since most features are calculated from numbers
1235   with a precision no better than 1 in 128, the variance should never be
1236   less than the square of this number for parameters whose range is 1. */
1237 #define MINVARIANCE 0.0004
1238 
/** define the absolute minimum number of samples (MINSAMPLES) which must be
  present in order to accurately test hypotheses about underlying probability
  distributions.  MINSAMPLESNEEDED is defined separately as the minimum number
  of samples required before a statistical analysis is attempted; it should
  normally equal MINSAMPLES, but may be set to a lower number for early
  testing when very few samples are available. */
1245 #define MINSAMPLESPERBUCKET 5
1246 #define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET)
1247 #define MINSAMPLESNEEDED 1
1248 
1249 /** define the size of the table which maps normalized samples to
1250   histogram buckets.  Also define the number of standard deviations
1251   in a normal distribution which are considered to be significant.
1252   The mapping table will be defined in such a way that it covers
1253   the specified number of standard deviations on either side of
1254   the mean.  BUCKETTABLESIZE should always be even. */
1255 #define BUCKETTABLESIZE 1024
1256 #define NORMALEXTENT 3.0
1257 
// A candidate merge considered during agglomerative clustering: a cluster
// paired with the neighboring cluster it might be combined with.
struct TEMPCLUSTER {
  CLUSTER *Cluster;  // cluster to be merged
  CLUSTER *Neighbor; // its nearest neighbor (see FindNearestNeighbor)
};

// Heap entry: float key (distance between the pair) + candidate merge.
// GenericHeap with an "Inc" pair keeps the smallest key on top, so the
// closest pair of clusters is always the next candidate.
using ClusterPair = tesseract::KDPairInc<float, TEMPCLUSTER *>;
using ClusterHeap = tesseract::GenericHeap<ClusterPair>;
1265 
// Per-cluster sample statistics for an n-dimensional feature space, used
// when deciding which kind of prototype fits a cluster.
struct STATISTICS {
  // n = number of dimensions; CoVariance holds the n x n covariance matrix
  // flattened into a single vector.  NOTE(review): element layout (row- vs
  // column-major) is not visible here — confirm against ComputeStatistics.
  STATISTICS(size_t n) : CoVariance(n * n), Min(n), Max(n) {
  }
  float AvgVariance = 1.0f; // average variance (per the name; set elsewhere)
  std::vector<float> CoVariance;
  std::vector<float> Min; // largest negative distance from the mean
  std::vector<float> Max; // largest positive distance from the mean
};
1274 
1275 struct BUCKETS {
BUCKETStesseract::BUCKETS1276   BUCKETS(size_t n) : NumberOfBuckets(n), Count(n), ExpectedCount(n) {
1277   }
~BUCKETStesseract::BUCKETS1278   ~BUCKETS() {
1279   }
1280   DISTRIBUTION Distribution = normal; // distribution being tested for
1281   uint32_t SampleCount = 0;         // # of samples in histogram
1282   double Confidence = 0.0;          // confidence level of test
1283   double ChiSquared = 0.0;          // test threshold
1284   uint16_t NumberOfBuckets;         // number of cells in histogram
1285   uint16_t Bucket[BUCKETTABLESIZE]; // mapping to histogram buckets
1286   std::vector<uint32_t> Count;      // frequency of occurrence histogram
1287   std::vector<float> ExpectedCount; // expected histogram
1288 };
1289 
// A chi-squared value together with the degrees of freedom and alpha
// (confidence) level it belongs to; used when solving for chi-squared
// test thresholds (see Solve() / ChiArea()).
struct CHISTRUCT {
  /// This constructor allocates a new data structure which is used
  /// to hold a chi-squared value along with its associated
  /// number of degrees of freedom and alpha value.
  ///
  /// @param degreesOfFreedom  degrees of freedom for new chi value
  /// @param alpha     confidence level for new chi value
  CHISTRUCT(uint16_t degreesOfFreedom, double alpha) : DegreesOfFreedom(degreesOfFreedom), Alpha(alpha) {
  }
  uint16_t DegreesOfFreedom = 0; // degrees of freedom of the distribution
  double Alpha = 0.0;            // confidence level
  double ChiSquared = 0.0;       // chi-squared value; 0 until computed
};
1303 
1304 // For use with KDWalk / MakePotentialClusters
1305 struct ClusteringContext {
1306   ClusterHeap *heap;       // heap used to hold temp clusters, "best" on top
1307   TEMPCLUSTER *candidates; // array of potential clusters
1308   KDTREE *tree;            // kd-tree to be searched for neighbors
1309   int32_t next;            // next candidate to be used
1310 };
1311 
// Density over a histogram-bucket index x (matches NormalDensity /
// UniformDensity below).
using DENSITYFUNC = double (*)(int32_t);
// f(params, x) whose root is sought by Solve() (matches ChiArea below).
using SOLVEFUNC = double (*)(CHISTRUCT *, double);

// Odd(N): nonzero when N is odd.  Mirror(N, R): reflect index N within a
// range of size R, i.e. R - N - 1.  Abs(N): absolute value.
#define Odd(N) ((N) % 2)
#define Mirror(N, R) ((R) - (N)-1)
#define Abs(N) (((N) < 0) ? (-(N)) : (N))
1318 
1319 //--------------Global Data Definitions and Declarations----------------------
1320 /** the following variables describe a discrete normal distribution
1321   which is used by NormalDensity() and NormalBucket().  The
1322   constant NORMALEXTENT determines how many standard
1323   deviations of the distribution are mapped onto the fixed
1324   discrete range of x.  x=0 is mapped to -NORMALEXTENT standard
1325   deviations and x=BUCKETTABLESIZE is mapped to
1326   +NORMALEXTENT standard deviations. */
1327 #define SqrtOf2Pi 2.506628275
1328 static const double kNormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT);
1329 static const double kNormalVariance =
1330     (BUCKETTABLESIZE * BUCKETTABLESIZE) / (4.0 * NORMALEXTENT * NORMALEXTENT);
1331 static const double kNormalMagnitude = (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE);
1332 static const double kNormalMean = BUCKETTABLESIZE / 2;
1333 
1334 /** define lookup tables used to compute the number of histogram buckets
1335   that should be used for a given number of samples. */
1336 #define LOOKUPTABLESIZE 8
1337 #define MAXDEGREESOFFREEDOM MAXBUCKETS
1338 
1339 static const uint32_t kCountTable[LOOKUPTABLESIZE] = {MINSAMPLES, 200,  400, 600, 800,
1340                                                       1000,       1500, 2000}; // number of samples
1341 
1342 static const uint16_t kBucketsTable[LOOKUPTABLESIZE] = {
1343     MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS}; // number of buckets
1344 
/*-------------------------------------------------------------------------
          Private Function Prototypes
--------------------------------------------------------------------------*/
// --- Bottom-up construction of the cluster tree -------------------------
static void CreateClusterTree(CLUSTERER *Clusterer);

static void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t Level);

static CLUSTER *FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, float *Distance);

static CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster);

// --- Prototype computation from the finished cluster tree ---------------
static void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);

static PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster);

static PROTOTYPE *MakeDegenerateProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics,
                                      PROTOSTYLE Style, int32_t MinSamples);

static PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster,
                                      STATISTICS *Statistics);

static PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
                                     BUCKETS *Buckets);

static PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster,
                                      STATISTICS *Statistics, BUCKETS *Buckets);

static PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
                                 BUCKETS *NormalBuckets, double Confidence);

static void MakeDimRandom(uint16_t i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc);

static void MakeDimUniform(uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics);

// --- Statistics gathering and prototype allocation ----------------------
static STATISTICS *ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster);

static PROTOTYPE *NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics);

static PROTOTYPE *NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics);

static PROTOTYPE *NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics);

static PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster);

static bool Independent(PARAM_DESC *ParamDesc, int16_t N, float *CoVariance, float Independence);

// --- Histogram (BUCKETS) machinery for distribution testing -------------
static BUCKETS *GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uint32_t SampleCount,
                           double Confidence);

static BUCKETS *MakeBuckets(DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence);

static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount);

static double ComputeChiSquared(uint16_t DegreesOfFreedom, double Alpha);

static double NormalDensity(int32_t x);

static double UniformDensity(int32_t x);

static double Integral(double f1, double f2, double Dx);

static void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc,
                        float Mean, float StdDev);

static uint16_t NormalBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev);

static uint16_t UniformBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev);

static bool DistributionOK(BUCKETS *Buckets);

static uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets);

static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount);

static void InitBuckets(BUCKETS *Buckets);

// --- Numeric solving for chi-squared thresholds -------------------------
static int AlphaMatch(void *arg1,  // CHISTRUCT *ChiStruct,
                      void *arg2); // CHISTRUCT *SearchKey);

static double Solve(SOLVEFUNC Function, void *FunctionParams, double InitialGuess, double Accuracy);

static double ChiArea(CHISTRUCT *ChiParams, double x);

// --- Miscellaneous helpers ----------------------------------------------
static bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, float MaxIllegal);

static double InvertMatrix(const float *input, int size, float *inv);
1431 
1432 //--------------------------Public Code--------------------------------------
1433 /**
1434  * This routine creates a new clusterer data structure,
1435  * initializes it, and returns a pointer to it.
1436  *
1437  * @param SampleSize  number of dimensions in feature space
1438  * @param ParamDesc description of each dimension
1439  * @return pointer to the new clusterer data structure
1440  */
MakeClusterer(int16_t SampleSize,const PARAM_DESC ParamDesc[])1441 CLUSTERER *MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[]) {
1442   int i;
1443 
1444   // allocate main clusterer data structure and init simple fields
1445   auto Clusterer = new CLUSTERER;
1446   Clusterer->SampleSize = SampleSize;
1447   Clusterer->NumberOfSamples = 0;
1448   Clusterer->NumChar = 0;
1449 
1450   // init fields which will not be used initially
1451   Clusterer->Root = nullptr;
1452   Clusterer->ProtoList = NIL_LIST;
1453 
1454   // maintain a copy of param descriptors in the clusterer data structure
1455   Clusterer->ParamDesc = new PARAM_DESC[SampleSize];
1456   for (i = 0; i < SampleSize; i++) {
1457     Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
1458     Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
1459     Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
1460     Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
1461     Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
1462     Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
1463     Clusterer->ParamDesc[i].MidRange = (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
1464   }
1465 
1466   // allocate a kd tree to hold the samples
1467   Clusterer->KDTree = MakeKDTree(SampleSize, ParamDesc);
1468 
1469   // Initialize cache of histogram buckets to minimize recomputing them.
1470   for (auto &d : Clusterer->bucket_cache) {
1471     for (auto &c : d) {
1472       c = nullptr;
1473     }
1474   }
1475 
1476   return Clusterer;
1477 } // MakeClusterer
1478 
1479 /**
1480  * This routine creates a new sample data structure to hold
1481  * the specified feature.  This sample is added to the clusterer
1482  * data structure (so that it knows which samples are to be
1483  * clustered later), and a pointer to the sample is returned to
1484  * the caller.
1485  *
1486  * @param Clusterer clusterer data structure to add sample to
1487  * @param Feature feature to be added to clusterer
1488  * @param CharID  unique ident. of char that sample came from
1489  *
1490  * @return    Pointer to the new sample data structure
1491  */
MakeSample(CLUSTERER * Clusterer,const float * Feature,uint32_t CharID)1492 SAMPLE *MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID) {
1493   int i;
1494 
1495   // see if the samples have already been clustered - if so trap an error
1496   // Can't add samples after they have been clustered.
1497   ASSERT_HOST(Clusterer->Root == nullptr);
1498 
1499   // allocate the new sample and initialize it
1500   auto Sample = new SAMPLE(Clusterer->SampleSize);
1501   Sample->Clustered = false;
1502   Sample->Prototype = false;
1503   Sample->SampleCount = 1;
1504   Sample->Left = nullptr;
1505   Sample->Right = nullptr;
1506   Sample->CharID = CharID;
1507 
1508   for (i = 0; i < Clusterer->SampleSize; i++) {
1509     Sample->Mean[i] = Feature[i];
1510   }
1511 
1512   // add the sample to the KD tree - keep track of the total # of samples
1513   Clusterer->NumberOfSamples++;
1514   KDStore(Clusterer->KDTree, &Sample->Mean[0], Sample);
1515   if (CharID >= Clusterer->NumChar) {
1516     Clusterer->NumChar = CharID + 1;
1517   }
1518 
1519   // execute hook for monitoring clustering operation
1520   // (*SampleCreationHook)(Sample);
1521 
1522   return (Sample);
1523 } // MakeSample
1524 
1525 /**
1526  * This routine first checks to see if the samples in this
1527  * clusterer have already been clustered before; if so, it does
1528  * not bother to recreate the cluster tree.  It simply recomputes
1529  * the prototypes based on the new Config info.
1530  *
1531  * If the samples have not been clustered before, the
1532  * samples in the KD tree are formed into a cluster tree and then
1533  * the prototypes are computed from the cluster tree.
1534  *
1535  * In either case this routine returns a pointer to a
1536  * list of prototypes that best represent the samples given
1537  * the constraints specified in Config.
1538  *
1539  * @param Clusterer data struct containing samples to be clustered
1540  * @param Config  parameters which control clustering process
1541  *
1542  * @return Pointer to a list of prototypes
1543  */
ClusterSamples(CLUSTERER * Clusterer,CLUSTERCONFIG * Config)1544 LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
1545   // only create cluster tree if samples have never been clustered before
1546   if (Clusterer->Root == nullptr) {
1547     CreateClusterTree(Clusterer);
1548   }
1549 
1550   // deallocate the old prototype list if one exists
1551   FreeProtoList(&Clusterer->ProtoList);
1552   Clusterer->ProtoList = NIL_LIST;
1553 
1554   // compute prototypes starting at the root node in the tree
1555   ComputePrototypes(Clusterer, Config);
1556   // We don't need the cluster pointers in the protos any more, so null them
1557   // out, which makes it safe to delete the clusterer.
1558   LIST proto_list = Clusterer->ProtoList;
1559   iterate(proto_list) {
1560     auto *proto = reinterpret_cast<PROTOTYPE *>(proto_list->first_node());
1561     proto->Cluster = nullptr;
1562   }
1563   return Clusterer->ProtoList;
1564 } // ClusterSamples
1565 
1566 /**
1567  * This routine frees all of the memory allocated to the
1568  * specified data structure.  It will not, however, free
1569  * the memory used by the prototype list.  The pointers to
1570  * the clusters for each prototype in the list will be set
1571  * to nullptr to indicate that the cluster data structures no
1572  * longer exist.  Any sample lists that have been obtained
1573  * via calls to GetSamples are no longer valid.
1574  * @param Clusterer pointer to data structure to be freed
1575  */
FreeClusterer(CLUSTERER * Clusterer)1576 void FreeClusterer(CLUSTERER *Clusterer) {
1577   if (Clusterer != nullptr) {
1578     delete[] Clusterer->ParamDesc;
1579     delete Clusterer->KDTree;
1580     delete Clusterer->Root;
1581     // Free up all used buckets structures.
1582     for (auto &d : Clusterer->bucket_cache) {
1583       for (auto &c : d) {
1584         delete c;
1585       }
1586     }
1587 
1588     delete Clusterer;
1589   }
1590 } // FreeClusterer
1591 
1592 /**
1593  * This routine frees all of the memory allocated to the
1594  * specified list of prototypes.  The clusters which are
1595  * pointed to by the prototypes are not freed.
1596  * @param ProtoList pointer to list of prototypes to be freed
1597  */
void FreeProtoList(LIST *ProtoList) {
  // destroy_nodes frees each list node and applies FreePrototype to each
  // element; the clusters the prototypes point at are left intact.
  destroy_nodes(*ProtoList, FreePrototype);
} // FreeProtoList
1601 
1602 /**
1603  * This routine deallocates the memory consumed by the specified
1604  * prototype and modifies the corresponding cluster so that it
1605  * is no longer marked as a prototype.  The cluster is NOT
1606  * deallocated by this routine.
1607  * @param arg prototype data structure to be deallocated
1608  */
FreePrototype(void * arg)1609 void FreePrototype(void *arg) { // PROTOTYPE     *Prototype)
1610   auto *Prototype = static_cast<PROTOTYPE *>(arg);
1611 
1612   // unmark the corresponding cluster (if there is one
1613   if (Prototype->Cluster != nullptr) {
1614     Prototype->Cluster->Prototype = false;
1615   }
1616 
1617   // deallocate the prototype statistics and then the prototype itself
1618   if (Prototype->Style != spherical) {
1619     delete[] Prototype->Variance.Elliptical;
1620     delete[] Prototype->Magnitude.Elliptical;
1621     delete[] Prototype->Weight.Elliptical;
1622   }
1623   delete Prototype;
1624 } // FreePrototype
1625 
1626 /**
1627  * This routine is used to find all of the samples which
1628  * belong to a cluster.  It starts by removing the top
1629  * cluster on the cluster list (SearchState).  If this cluster is
1630  * a leaf it is returned.  Otherwise, the right subcluster
1631  * is pushed on the list and we continue the search in the
1632  * left subcluster.  This continues until a leaf is found.
1633  * If all samples have been found, nullptr is returned.
1634  * InitSampleSearch() must be called
1635  * before NextSample() to initialize the search.
1636  * @param SearchState ptr to list containing clusters to be searched
1637  * @return  Pointer to the next leaf cluster (sample) or nullptr.
1638  */
NextSample(LIST * SearchState)1639 CLUSTER *NextSample(LIST *SearchState) {
1640   CLUSTER *Cluster;
1641 
1642   if (*SearchState == NIL_LIST) {
1643     return (nullptr);
1644   }
1645   Cluster = reinterpret_cast<CLUSTER *>((*SearchState)->first_node());
1646   *SearchState = pop(*SearchState);
1647   for (;;) {
1648     if (Cluster->Left == nullptr) {
1649       return (Cluster);
1650     }
1651     *SearchState = push(*SearchState, Cluster->Right);
1652     Cluster = Cluster->Left;
1653   }
1654 } // NextSample
1655 
1656 /**
1657  * This routine returns the mean of the specified
1658  * prototype in the indicated dimension.
1659  * @param Proto prototype to return mean of
1660  * @param Dimension dimension whose mean is to be returned
1661  * @return  Mean of Prototype in Dimension
1662  */
float Mean(PROTOTYPE *Proto, uint16_t Dimension) {
  // Simple accessor: the prototype stores its mean vector directly.
  return (Proto->Mean[Dimension]);
} // Mean
1666 
1667 /**
1668  * This routine returns the standard deviation of the
1669  * prototype in the indicated dimension.
1670  * @param Proto   prototype to return standard deviation of
1671  * @param Dimension dimension whose stddev is to be returned
1672  * @return  Standard deviation of Prototype in Dimension
1673  */
StandardDeviation(PROTOTYPE * Proto,uint16_t Dimension)1674 float StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension) {
1675   switch (Proto->Style) {
1676     case spherical:
1677       return std::sqrt(Proto->Variance.Spherical);
1678     case elliptical:
1679       return std::sqrt(Proto->Variance.Elliptical[Dimension]);
1680     case mixed:
1681       switch (Proto->Distrib[Dimension]) {
1682         case normal:
1683           return std::sqrt(Proto->Variance.Elliptical[Dimension]);
1684         case uniform:
1685         case D_random:
1686           return Proto->Variance.Elliptical[Dimension];
1687         case DISTRIBUTION_COUNT:
1688           ASSERT_HOST(!"Distribution count not allowed!");
1689       }
1690   }
1691   return 0.0f;
1692 } // StandardDeviation
1693 
1694 /*---------------------------------------------------------------------------
1695             Private Code
1696 ----------------------------------------------------------------------------*/
1697 /**
1698  * This routine performs a bottoms-up clustering on the samples
1699  * held in the kd-tree of the Clusterer data structure.  The
1700  * result is a cluster tree.  Each node in the tree represents
1701  * a cluster which conceptually contains a subset of the samples.
1702  * More precisely, the cluster contains all of the samples which
1703  * are contained in its two sub-clusters.  The leaves of the
1704  * tree are the individual samples themselves; they have no
1705  * sub-clusters.  The root node of the tree conceptually contains
1706  * all of the samples.
1707  * The Clusterer data structure is changed.
1708  * @param Clusterer data structure holdings samples to be clustered
1709  */
static void CreateClusterTree(CLUSTERER *Clusterer) {
  ClusteringContext context;
  ClusterPair HeapEntry;

  // each sample and its nearest neighbor form a "potential" cluster
  // save these in a heap with the "best" potential clusters on top
  context.tree = Clusterer->KDTree;
  context.candidates = new TEMPCLUSTER[Clusterer->NumberOfSamples];
  context.next = 0;
  context.heap = new ClusterHeap(Clusterer->NumberOfSamples);
  KDWalk(context.tree, reinterpret_cast<void_proc>(MakePotentialClusters), &context);

  // form potential clusters into actual clusters - always do "best" first
  while (context.heap->Pop(&HeapEntry)) {
    TEMPCLUSTER *PotentialCluster = HeapEntry.data();

    // if main cluster of potential cluster is already in another cluster
    // then we don't need to worry about it
    if (PotentialCluster->Cluster->Clustered) {
      continue;
    }

    // if main cluster is not yet clustered, but its nearest neighbor is
    // then we must find a new nearest neighbor
    else if (PotentialCluster->Neighbor->Clustered) {
      PotentialCluster->Neighbor =
          FindNearestNeighbor(context.tree, PotentialCluster->Cluster, &HeapEntry.key());
      // Re-queue the pair with the distance to the new neighbor as its key.
      if (PotentialCluster->Neighbor != nullptr) {
        context.heap->Push(&HeapEntry);
      }
    }

    // if neither cluster is already clustered, form permanent cluster
    else {
      PotentialCluster->Cluster = MakeNewCluster(Clusterer, PotentialCluster);
      // The merged cluster becomes a candidate for further merging.
      PotentialCluster->Neighbor =
          FindNearestNeighbor(context.tree, PotentialCluster->Cluster, &HeapEntry.key());
      if (PotentialCluster->Neighbor != nullptr) {
        context.heap->Push(&HeapEntry);
      }
    }
  }

  // the root node in the cluster tree is now the only node in the kd-tree
  Clusterer->Root = static_cast<CLUSTER *> RootOf(Clusterer->KDTree);

  // free up the memory used by the K-D tree, heap, and temp clusters;
  // the cluster nodes themselves now live on via Clusterer->Root.
  delete context.tree;
  Clusterer->KDTree = nullptr;
  delete context.heap;
  delete[] context.candidates;
} // CreateClusterTree
1762 
1763 /**
1764  * This routine is designed to be used in concert with the
1765  * KDWalk routine.  It will create a potential cluster for
1766  * each sample in the kd-tree that is being walked.  This
1767  * potential cluster will then be pushed on the heap.
1768  * @param context  ClusteringContext (see definition above)
1769  * @param Cluster  current cluster being visited in kd-tree walk
1770  * @param Level  level of this cluster in the kd-tree
1771  */
MakePotentialClusters(ClusteringContext * context,CLUSTER * Cluster,int32_t)1772 static void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t /*Level*/) {
1773   ClusterPair HeapEntry;
1774   int next = context->next;
1775   context->candidates[next].Cluster = Cluster;
1776   HeapEntry.data() = &(context->candidates[next]);
1777   context->candidates[next].Neighbor =
1778       FindNearestNeighbor(context->tree, context->candidates[next].Cluster, &HeapEntry.key());
1779   if (context->candidates[next].Neighbor != nullptr) {
1780     context->heap->Push(&HeapEntry);
1781     context->next++;
1782   }
1783 } // MakePotentialClusters
1784 
1785 /**
1786  * This routine searches the specified kd-tree for the nearest
1787  * neighbor of the specified cluster.  It actually uses the
1788  * kd routines to find the 2 nearest neighbors since one of them
1789  * will be the original cluster.  A pointer to the nearest
1790  * neighbor is returned, if it can be found, otherwise nullptr is
1791  * returned.  The distance between the 2 nodes is placed
1792  * in the specified variable.
1793  * @param Tree    kd-tree to search in for nearest neighbor
1794  * @param Cluster cluster whose nearest neighbor is to be found
1795  * @param Distance  ptr to variable to report distance found
1796  * @return  Pointer to the nearest neighbor of Cluster, or nullptr
1797  */
FindNearestNeighbor(KDTREE * Tree,CLUSTER * Cluster,float * Distance)1798 static CLUSTER *FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, float *Distance)
1799 #define MAXNEIGHBORS 2
1800 #define MAXDISTANCE FLT_MAX
1801 {
1802   CLUSTER *Neighbor[MAXNEIGHBORS];
1803   float Dist[MAXNEIGHBORS];
1804   int NumberOfNeighbors;
1805   int32_t i;
1806   CLUSTER *BestNeighbor;
1807 
1808   // find the 2 nearest neighbors of the cluster
1809   KDNearestNeighborSearch(Tree, &Cluster->Mean[0], MAXNEIGHBORS, MAXDISTANCE, &NumberOfNeighbors,
1810                           reinterpret_cast<void **>(Neighbor), Dist);
1811 
1812   // search for the nearest neighbor that is not the cluster itself
1813   *Distance = MAXDISTANCE;
1814   BestNeighbor = nullptr;
1815   for (i = 0; i < NumberOfNeighbors; i++) {
1816     if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
1817       *Distance = Dist[i];
1818       BestNeighbor = Neighbor[i];
1819     }
1820   }
1821   return BestNeighbor;
1822 } // FindNearestNeighbor
1823 
1824 /**
1825  * This routine creates a new permanent cluster from the
1826  * clusters specified in TempCluster.  The 2 clusters in
1827  * TempCluster are marked as "clustered" and deleted from
1828  * the kd-tree.  The new cluster is then added to the kd-tree.
1829  * @param Clusterer current clustering environment
1830  * @param TempCluster potential cluster to make permanent
1831  * @return Pointer to the new permanent cluster
1832  */
MakeNewCluster(CLUSTERER * Clusterer,TEMPCLUSTER * TempCluster)1833 static CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) {
1834   // allocate the new cluster and initialize it
1835   auto Cluster = new CLUSTER(Clusterer->SampleSize);
1836   Cluster->Clustered = false;
1837   Cluster->Prototype = false;
1838   Cluster->Left = TempCluster->Cluster;
1839   Cluster->Right = TempCluster->Neighbor;
1840   Cluster->CharID = -1;
1841 
1842   // mark the old clusters as "clustered" and delete them from the kd-tree
1843   Cluster->Left->Clustered = true;
1844   Cluster->Right->Clustered = true;
1845   KDDelete(Clusterer->KDTree, &Cluster->Left->Mean[0], Cluster->Left);
1846   KDDelete(Clusterer->KDTree, &Cluster->Right->Mean[0], Cluster->Right);
1847 
1848   // compute the mean and sample count for the new cluster
1849   Cluster->SampleCount = MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,
1850                                        Cluster->Left->SampleCount, Cluster->Right->SampleCount,
1851                                        &Cluster->Mean[0], &Cluster->Left->Mean[0], &Cluster->Right->Mean[0]);
1852 
1853   // add the new cluster to the KD tree
1854   KDStore(Clusterer->KDTree, &Cluster->Mean[0], Cluster);
1855   return Cluster;
1856 } // MakeNewCluster
1857 
1858 /**
1859  * This routine merges two clusters into one larger cluster.
1860  * To do this it computes the number of samples in the new
1861  * cluster and the mean of the new cluster.  The ParamDesc
1862  * information is used to ensure that circular dimensions
1863  * are handled correctly.
1864  * @param N # of dimensions (size of arrays)
1865  * @param ParamDesc array of dimension descriptions
1866  * @param n1, n2  number of samples in each old cluster
1867  * @param m array to hold mean of new cluster
1868  * @param m1, m2  arrays containing means of old clusters
1869  * @return  The number of samples in the new cluster.
1870  */
MergeClusters(int16_t N,PARAM_DESC ParamDesc[],int32_t n1,int32_t n2,float m[],float m1[],float m2[])1871 int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[],
1872                       float m1[], float m2[]) {
1873   int32_t i, n;
1874 
1875   n = n1 + n2;
1876   for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
1877     if (ParamDesc->Circular) {
1878       // if distance between means is greater than allowed
1879       // reduce upper point by one "rotation" to compute mean
1880       // then normalize the mean back into the accepted range
1881       if ((*m2 - *m1) > ParamDesc->HalfRange) {
1882         *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
1883         if (*m < ParamDesc->Min) {
1884           *m += ParamDesc->Range;
1885         }
1886       } else if ((*m1 - *m2) > ParamDesc->HalfRange) {
1887         *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
1888         if (*m < ParamDesc->Min) {
1889           *m += ParamDesc->Range;
1890         }
1891       } else {
1892         *m = (n1 * *m1 + n2 * *m2) / n;
1893       }
1894     } else {
1895       *m = (n1 * *m1 + n2 * *m2) / n;
1896     }
1897   }
1898   return n;
1899 } // MergeClusters
1900 
1901 /**
1902  * This routine decides which clusters in the cluster tree
1903  * should be represented by prototypes, forms a list of these
1904  * prototypes, and places the list in the Clusterer data
1905  * structure.
1906  * @param Clusterer data structure holding cluster tree
1907  * @param Config    parameters used to control prototype generation
1908  */
ComputePrototypes(CLUSTERER * Clusterer,CLUSTERCONFIG * Config)1909 static void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
1910   LIST ClusterStack = NIL_LIST;
1911   CLUSTER *Cluster;
1912   PROTOTYPE *Prototype;
1913 
1914   // use a stack to keep track of clusters waiting to be processed
1915   // initially the only cluster on the stack is the root cluster
1916   if (Clusterer->Root != nullptr) {
1917     ClusterStack = push(NIL_LIST, Clusterer->Root);
1918   }
1919 
1920   // loop until we have analyzed all clusters which are potential prototypes
1921   while (ClusterStack != NIL_LIST) {
1922     // remove the next cluster to be analyzed from the stack
1923     // try to make a prototype from the cluster
1924     // if successful, put it on the proto list, else split the cluster
1925     Cluster = reinterpret_cast<CLUSTER *>(ClusterStack->first_node());
1926     ClusterStack = pop(ClusterStack);
1927     Prototype = MakePrototype(Clusterer, Config, Cluster);
1928     if (Prototype != nullptr) {
1929       Clusterer->ProtoList = push(Clusterer->ProtoList, Prototype);
1930     } else {
1931       ClusterStack = push(ClusterStack, Cluster->Right);
1932       ClusterStack = push(ClusterStack, Cluster->Left);
1933     }
1934   }
1935 } // ComputePrototypes
1936 
1937 /**
1938  * This routine attempts to create a prototype from the
1939  * specified cluster that conforms to the distribution
1940  * specified in Config.  If there are too few samples in the
1941  * cluster to perform a statistical analysis, then a prototype
1942  * is generated but labelled as insignificant.  If the
1943  * dimensions of the cluster are not independent, no prototype
1944  * is generated and nullptr is returned.  If a prototype can be
1945  * found that matches the desired distribution then a pointer
1946  * to it is returned, otherwise nullptr is returned.
1947  * @param Clusterer data structure holding cluster tree
1948  * @param Config  parameters used to control prototype generation
1949  * @param Cluster cluster to be made into a prototype
1950  * @return  Pointer to new prototype or nullptr
1951  */
MakePrototype(CLUSTERER * Clusterer,CLUSTERCONFIG * Config,CLUSTER * Cluster)1952 static PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster) {
1953   PROTOTYPE *Proto;
1954   BUCKETS *Buckets;
1955 
1956   // filter out clusters which contain samples from the same character
1957   if (MultipleCharSamples(Clusterer, Cluster, Config->MaxIllegal)) {
1958     return nullptr;
1959   }
1960 
1961   // compute the covariance matrix and ranges for the cluster
1962   auto Statistics = ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);
1963 
1964   // check for degenerate clusters which need not be analyzed further
1965   // note that the MinSamples test assumes that all clusters with multiple
1966   // character samples have been removed (as above)
1967   Proto = MakeDegenerateProto(Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle,
1968                               static_cast<int32_t>(Config->MinSamples * Clusterer->NumChar));
1969   if (Proto != nullptr) {
1970     delete Statistics;
1971     return Proto;
1972   }
1973   // check to ensure that all dimensions are independent
1974   if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize, &Statistics->CoVariance[0],
1975                    Config->Independence)) {
1976     delete Statistics;
1977     return nullptr;
1978   }
1979 
1980   if (HOTELLING && Config->ProtoStyle == elliptical) {
1981     Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
1982     if (Proto != nullptr) {
1983       delete Statistics;
1984       return Proto;
1985     }
1986   }
1987 
1988   // create a histogram data structure used to evaluate distributions
1989   Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount, Config->Confidence);
1990 
1991   // create a prototype based on the statistics and test it
1992   switch (Config->ProtoStyle) {
1993     case spherical:
1994       Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
1995       break;
1996     case elliptical:
1997       Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
1998       break;
1999     case mixed:
2000       Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets, Config->Confidence);
2001       break;
2002     case automatic:
2003       Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
2004       if (Proto != nullptr) {
2005         break;
2006       }
2007       Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
2008       if (Proto != nullptr) {
2009         break;
2010       }
2011       Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets, Config->Confidence);
2012       break;
2013   }
2014   delete Statistics;
2015   return Proto;
2016 } // MakePrototype
2017 
2018 /**
2019  * This routine checks for clusters which are degenerate and
2020  * therefore cannot be analyzed in a statistically valid way.
2021  * A cluster is defined as degenerate if it does not have at
2022  * least MINSAMPLESNEEDED samples in it.  If the cluster is
2023  * found to be degenerate, a prototype of the specified style
2024  * is generated and marked as insignificant.  A cluster is
2025  * also degenerate if it does not have at least MinSamples
2026  * samples in it.
2027  *
2028  * If the cluster is not degenerate, nullptr is returned.
2029  *
2030  * @param N   number of dimensions
2031  * @param Cluster   cluster being analyzed
2032  * @param Statistics  statistical info about cluster
2033  * @param Style   type of prototype to be generated
2034  * @param MinSamples  minimum number of samples in a cluster
2035  * @return  Pointer to degenerate prototype or nullptr.
2036  */
MakeDegenerateProto(uint16_t N,CLUSTER * Cluster,STATISTICS * Statistics,PROTOSTYLE Style,int32_t MinSamples)2037 static PROTOTYPE *MakeDegenerateProto( // this was MinSample
2038     uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, int32_t MinSamples) {
2039   PROTOTYPE *Proto = nullptr;
2040 
2041   if (MinSamples < MINSAMPLESNEEDED) {
2042     MinSamples = MINSAMPLESNEEDED;
2043   }
2044 
2045   if (Cluster->SampleCount < MinSamples) {
2046     switch (Style) {
2047       case spherical:
2048         Proto = NewSphericalProto(N, Cluster, Statistics);
2049         break;
2050       case elliptical:
2051       case automatic:
2052         Proto = NewEllipticalProto(N, Cluster, Statistics);
2053         break;
2054       case mixed:
2055         Proto = NewMixedProto(N, Cluster, Statistics);
2056         break;
2057     }
2058     Proto->Significant = false;
2059   }
2060   return (Proto);
2061 } // MakeDegenerateProto
2062 
2063 /**
2064  * This routine tests the specified cluster to see if **
2065  * there is a statistically significant difference between
2066  * the sub-clusters that would be made if the cluster were to
2067  * be split. If not, then a new prototype is formed and
2068  * returned to the caller. If there is, then nullptr is returned
2069  * to the caller.
2070  * @param Clusterer data struct containing samples being clustered
2071  * @param Config provides the magic number of samples that make a good cluster
2072  * @param Cluster   cluster to be made into an elliptical prototype
2073  * @param Statistics  statistical info about cluster
2074  * @return Pointer to new elliptical prototype or nullptr.
2075  */
static PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster,
                                      STATISTICS *Statistics) {
  // Fraction of the number of samples used as a range around 1 within
  // which a cluster has the magic size that allows a boost to the
  // FTable by kFTableBoostMargin, thus allowing clusters near the
  // magic size (equal to the number of sample characters) to be more
  // likely to stay together.
  const double kMagicSampleMargin = 0.0625;
  const double kFTableBoostMargin = 2.0;

  int N = Clusterer->SampleSize;
  CLUSTER *Left = Cluster->Left;
  CLUSTER *Right = Cluster->Right;
  // A leaf cluster has no subclusters to compare.
  if (Left == nullptr || Right == nullptr) {
    return nullptr;
  }
  int TotalDims = Left->SampleCount + Right->SampleCount;
  // Too few samples for the F-statistic's degrees of freedom to be valid.
  if (TotalDims < N + 1 || TotalDims < 2) {
    return nullptr;
  }
  std::vector<float> Covariance(static_cast<size_t>(N) * N);
  std::vector<float> Inverse(static_cast<size_t>(N) * N);
  std::vector<float> Delta(N);
  // Compute a new covariance matrix that only uses essential features.
  // Non-essential rows/columns are replaced by identity entries so the
  // matrix stays invertible without influencing the test.
  for (int i = 0; i < N; ++i) {
    int row_offset = i * N;
    if (!Clusterer->ParamDesc[i].NonEssential) {
      for (int j = 0; j < N; ++j) {
        if (!Clusterer->ParamDesc[j].NonEssential) {
          Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
        } else {
          Covariance[j + row_offset] = 0.0f;
        }
      }
    } else {
      for (int j = 0; j < N; ++j) {
        if (i == j) {
          Covariance[j + row_offset] = 1.0f;
        } else {
          Covariance[j + row_offset] = 0.0f;
        }
      }
    }
  }
  double err = InvertMatrix(&Covariance[0], N, &Inverse[0]);
  if (err > 1) {
    tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
  }
  // Difference of the subcluster means over the essential dimensions only;
  // EssentialN counts those dimensions for the degrees of freedom below.
  int EssentialN = 0;
  for (int dim = 0; dim < N; ++dim) {
    if (!Clusterer->ParamDesc[dim].NonEssential) {
      Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
      ++EssentialN;
    } else {
      Delta[dim] = 0.0f;
    }
  }
  // Compute Hotelling's T-squared: Delta' * Inverse * Delta.
  double Tsq = 0.0;
  for (int x = 0; x < N; ++x) {
    double temp = 0.0;
    for (int y = 0; y < N; ++y) {
      temp += static_cast<double>(Inverse[y + N * x]) * Delta[y];
    }
    Tsq += Delta[x] * temp;
  }
  // Changed this function to match the formula in
  // Statistical Methods in Medical Research p 473
  // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
  // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
  double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2) * EssentialN);
  // Clamp the degrees of freedom to the bounds of the precomputed FTable,
  // then convert to 0-based indices.
  int Fx = EssentialN;
  if (Fx > FTABLE_X) {
    Fx = FTABLE_X;
  }
  --Fx;
  int Fy = TotalDims - EssentialN - 1;
  if (Fy > FTABLE_Y) {
    Fy = FTABLE_Y;
  }
  --Fy;
  double FTarget = FTable[Fy][Fx];
  if (Config->MagicSamples > 0 && TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
      TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
    // Give magic-sized clusters a magic FTable boost.
    FTarget += kFTableBoostMargin;
  }
  // F below the critical value means the split is not statistically
  // significant, so keep the cluster together as one elliptical proto.
  if (F < FTarget) {
    return NewEllipticalProto(Clusterer->SampleSize, Cluster, Statistics);
  }
  return nullptr;
}
2168 
2169 /**
2170  * This routine tests the specified cluster to see if it can
2171  * be approximated by a spherical normal distribution.  If it
2172  * can be, then a new prototype is formed and returned to the
2173  * caller.  If it can't be, then nullptr is returned to the caller.
2174  * @param Clusterer data struct containing samples being clustered
2175  * @param Cluster   cluster to be made into a spherical prototype
2176  * @param Statistics  statistical info about cluster
2177  * @param Buckets   histogram struct used to analyze distribution
2178  * @return  Pointer to new spherical prototype or nullptr.
2179  */
MakeSphericalProto(CLUSTERER * Clusterer,CLUSTER * Cluster,STATISTICS * Statistics,BUCKETS * Buckets)2180 static PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
2181                                      BUCKETS *Buckets) {
2182   PROTOTYPE *Proto = nullptr;
2183   int i;
2184 
2185   // check that each dimension is a normal distribution
2186   for (i = 0; i < Clusterer->SampleSize; i++) {
2187     if (Clusterer->ParamDesc[i].NonEssential) {
2188       continue;
2189     }
2190 
2191     FillBuckets(Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), Cluster->Mean[i],
2192                 sqrt(static_cast<double>(Statistics->AvgVariance)));
2193     if (!DistributionOK(Buckets)) {
2194       break;
2195     }
2196   }
2197   // if all dimensions matched a normal distribution, make a proto
2198   if (i >= Clusterer->SampleSize) {
2199     Proto = NewSphericalProto(Clusterer->SampleSize, Cluster, Statistics);
2200   }
2201   return (Proto);
2202 } // MakeSphericalProto
2203 
2204 /**
2205  * This routine tests the specified cluster to see if it can
2206  * be approximated by an elliptical normal distribution.  If it
2207  * can be, then a new prototype is formed and returned to the
2208  * caller.  If it can't be, then nullptr is returned to the caller.
2209  * @param Clusterer data struct containing samples being clustered
2210  * @param Cluster   cluster to be made into an elliptical prototype
2211  * @param Statistics  statistical info about cluster
2212  * @param Buckets   histogram struct used to analyze distribution
2213  * @return  Pointer to new elliptical prototype or nullptr.
2214  */
MakeEllipticalProto(CLUSTERER * Clusterer,CLUSTER * Cluster,STATISTICS * Statistics,BUCKETS * Buckets)2215 static PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster,
2216                                       STATISTICS *Statistics, BUCKETS *Buckets) {
2217   PROTOTYPE *Proto = nullptr;
2218   int i;
2219 
2220   // check that each dimension is a normal distribution
2221   for (i = 0; i < Clusterer->SampleSize; i++) {
2222     if (Clusterer->ParamDesc[i].NonEssential) {
2223       continue;
2224     }
2225 
2226     FillBuckets(Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), Cluster->Mean[i],
2227                 sqrt(static_cast<double>(Statistics->CoVariance[i * (Clusterer->SampleSize + 1)])));
2228     if (!DistributionOK(Buckets)) {
2229       break;
2230     }
2231   }
2232   // if all dimensions matched a normal distribution, make a proto
2233   if (i >= Clusterer->SampleSize) {
2234     Proto = NewEllipticalProto(Clusterer->SampleSize, Cluster, Statistics);
2235   }
2236   return (Proto);
2237 } // MakeEllipticalProto
2238 
2239 /**
2240  * This routine tests each dimension of the specified cluster to
2241  * see what distribution would best approximate that dimension.
2242  * Each dimension is compared to the following distributions
2243  * in order: normal, random, uniform.  If each dimension can
2244  * be represented by one of these distributions,
2245  * then a new prototype is formed and returned to the
2246  * caller.  If it can't be, then nullptr is returned to the caller.
2247  * @param Clusterer data struct containing samples being clustered
2248  * @param Cluster   cluster to be made into a prototype
2249  * @param Statistics  statistical info about cluster
2250  * @param NormalBuckets histogram struct used to analyze distribution
2251  * @param Confidence  confidence level for alternate distributions
2252  * @return  Pointer to new mixed prototype or nullptr.
2253  */
MakeMixedProto(CLUSTERER * Clusterer,CLUSTER * Cluster,STATISTICS * Statistics,BUCKETS * NormalBuckets,double Confidence)2254 static PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
2255                                  BUCKETS *NormalBuckets, double Confidence) {
2256   PROTOTYPE *Proto;
2257   int i;
2258   BUCKETS *UniformBuckets = nullptr;
2259   BUCKETS *RandomBuckets = nullptr;
2260 
2261   // create a mixed proto to work on - initially assume all dimensions normal
2262   Proto = NewMixedProto(Clusterer->SampleSize, Cluster, Statistics);
2263 
2264   // find the proper distribution for each dimension
2265   for (i = 0; i < Clusterer->SampleSize; i++) {
2266     if (Clusterer->ParamDesc[i].NonEssential) {
2267       continue;
2268     }
2269 
2270     FillBuckets(NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],
2271                 std::sqrt(Proto->Variance.Elliptical[i]));
2272     if (DistributionOK(NormalBuckets)) {
2273       continue;
2274     }
2275 
2276     if (RandomBuckets == nullptr) {
2277       RandomBuckets = GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence);
2278     }
2279     MakeDimRandom(i, Proto, &(Clusterer->ParamDesc[i]));
2280     FillBuckets(RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],
2281                 Proto->Variance.Elliptical[i]);
2282     if (DistributionOK(RandomBuckets)) {
2283       continue;
2284     }
2285 
2286     if (UniformBuckets == nullptr) {
2287       UniformBuckets = GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence);
2288     }
2289     MakeDimUniform(i, Proto, Statistics);
2290     FillBuckets(UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],
2291                 Proto->Variance.Elliptical[i]);
2292     if (DistributionOK(UniformBuckets)) {
2293       continue;
2294     }
2295     break;
2296   }
2297   // if any dimension failed to match a distribution, discard the proto
2298   if (i < Clusterer->SampleSize) {
2299     FreePrototype(Proto);
2300     Proto = nullptr;
2301   }
2302   return (Proto);
2303 } // MakeMixedProto
2304 
2305 /**
2306  * This routine alters the ith dimension of the specified
2307  * mixed prototype to be D_random.
2308  * @param i index of dimension to be changed
2309  * @param Proto prototype whose dimension is to be altered
2310  * @param ParamDesc description of specified dimension
2311  */
MakeDimRandom(uint16_t i,PROTOTYPE * Proto,PARAM_DESC * ParamDesc)2312 static void MakeDimRandom(uint16_t i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) {
2313   Proto->Distrib[i] = D_random;
2314   Proto->Mean[i] = ParamDesc->MidRange;
2315   Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;
2316 
2317   // subtract out the previous magnitude of this dimension from the total
2318   Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
2319   Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
2320   Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
2321   Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
2322 
2323   // note that the proto Weight is irrelevant for D_random protos
2324 } // MakeDimRandom
2325 
2326 /**
2327  * This routine alters the ith dimension of the specified
2328  * mixed prototype to be uniform.
2329  * @param i index of dimension to be changed
2330  * @param Proto   prototype whose dimension is to be altered
2331  * @param Statistics  statistical info about prototype
2332  */
MakeDimUniform(uint16_t i,PROTOTYPE * Proto,STATISTICS * Statistics)2333 static void MakeDimUniform(uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics) {
2334   Proto->Distrib[i] = uniform;
2335   Proto->Mean[i] = Proto->Cluster->Mean[i] + (Statistics->Min[i] + Statistics->Max[i]) / 2;
2336   Proto->Variance.Elliptical[i] = (Statistics->Max[i] - Statistics->Min[i]) / 2;
2337   if (Proto->Variance.Elliptical[i] < MINVARIANCE) {
2338     Proto->Variance.Elliptical[i] = MINVARIANCE;
2339   }
2340 
2341   // subtract out the previous magnitude of this dimension from the total
2342   Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
2343   Proto->Magnitude.Elliptical[i] = 1.0 / (2.0 * Proto->Variance.Elliptical[i]);
2344   Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
2345   Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
2346 
2347   // note that the proto Weight is irrelevant for uniform protos
2348 } // MakeDimUniform
2349 
2350 /**
2351  * This routine searches the cluster tree for all leaf nodes
2352  * which are samples in the specified cluster.  It computes
2353  * a full covariance matrix for these samples as well as
2354  * keeping track of the ranges (min and max) for each
2355  * dimension.  A special data structure is allocated to
2356  * return this information to the caller.  An incremental
2357  * algorithm for computing statistics is not used because
2358  * it will not work with circular dimensions.
2359  * @param N number of dimensions
2360  * @param ParamDesc array of dimension descriptions
2361  * @param Cluster cluster whose stats are to be computed
2362  * @return  Pointer to new data structure containing statistics
2363  */
ComputeStatistics(int16_t N,PARAM_DESC ParamDesc[],CLUSTER * Cluster)2364 static STATISTICS *ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster) {
2365   int i, j;
2366   LIST SearchState;
2367   SAMPLE *Sample;
2368   uint32_t SampleCountAdjustedForBias;
2369 
2370   // allocate memory to hold the statistics results
2371   auto Statistics = new STATISTICS(N);
2372 
2373   // allocate temporary memory to hold the sample to mean distances
2374   std::vector<float> Distance(N);
2375 
2376   // find each sample in the cluster and merge it into the statistics
2377   InitSampleSearch(SearchState, Cluster);
2378   while ((Sample = NextSample(&SearchState)) != nullptr) {
2379     for (i = 0; i < N; i++) {
2380       Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
2381       if (ParamDesc[i].Circular) {
2382         if (Distance[i] > ParamDesc[i].HalfRange) {
2383           Distance[i] -= ParamDesc[i].Range;
2384         }
2385         if (Distance[i] < -ParamDesc[i].HalfRange) {
2386           Distance[i] += ParamDesc[i].Range;
2387         }
2388       }
2389       if (Distance[i] < Statistics->Min[i]) {
2390         Statistics->Min[i] = Distance[i];
2391       }
2392       if (Distance[i] > Statistics->Max[i]) {
2393         Statistics->Max[i] = Distance[i];
2394       }
2395     }
2396     auto CoVariance = &Statistics->CoVariance[0];
2397     for (i = 0; i < N; i++) {
2398       for (j = 0; j < N; j++, CoVariance++) {
2399         *CoVariance += Distance[i] * Distance[j];
2400       }
2401     }
2402   }
2403   // normalize the variances by the total number of samples
2404   // use SampleCount-1 instead of SampleCount to get an unbiased estimate
2405   // also compute the geometic mean of the diagonal variances
2406   // ensure that clusters with only 1 sample are handled correctly
2407   if (Cluster->SampleCount > 1) {
2408     SampleCountAdjustedForBias = Cluster->SampleCount - 1;
2409   } else {
2410     SampleCountAdjustedForBias = 1;
2411   }
2412   auto CoVariance = &Statistics->CoVariance[0];
2413   for (i = 0; i < N; i++) {
2414     for (j = 0; j < N; j++, CoVariance++) {
2415       *CoVariance /= SampleCountAdjustedForBias;
2416       if (j == i) {
2417         if (*CoVariance < MINVARIANCE) {
2418           *CoVariance = MINVARIANCE;
2419         }
2420         Statistics->AvgVariance *= *CoVariance;
2421       }
2422     }
2423   }
2424   Statistics->AvgVariance =
2425       static_cast<float>(pow(static_cast<double>(Statistics->AvgVariance), 1.0 / N));
2426 
2427   return Statistics;
2428 } // ComputeStatistics
2429 
2430 /**
2431  * This routine creates a spherical prototype data structure to
2432  * approximate the samples in the specified cluster.
2433  * Spherical prototypes have a single variance which is
2434  * common across all dimensions.  All dimensions are normally
2435  * distributed and independent.
2436  * @param N number of dimensions
2437  * @param Cluster cluster to be made into a spherical prototype
2438  * @param Statistics  statistical info about samples in cluster
2439  * @return  Pointer to a new spherical prototype data structure
2440  */
NewSphericalProto(uint16_t N,CLUSTER * Cluster,STATISTICS * Statistics)2441 static PROTOTYPE *NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
2442   PROTOTYPE *Proto;
2443 
2444   Proto = NewSimpleProto(N, Cluster);
2445 
2446   Proto->Variance.Spherical = Statistics->AvgVariance;
2447   if (Proto->Variance.Spherical < MINVARIANCE) {
2448     Proto->Variance.Spherical = MINVARIANCE;
2449   }
2450 
2451   Proto->Magnitude.Spherical = 1.0 / sqrt(2.0 * M_PI * Proto->Variance.Spherical);
2452   Proto->TotalMagnitude = static_cast<float>(
2453       pow(static_cast<double>(Proto->Magnitude.Spherical), static_cast<double>(N)));
2454   Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
2455   Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
2456 
2457   return (Proto);
2458 } // NewSphericalProto
2459 
2460 /**
2461  * This routine creates an elliptical prototype data structure to
2462  * approximate the samples in the specified cluster.
2463  * Elliptical prototypes have a variance for each dimension.
2464  * All dimensions are normally distributed and independent.
2465  * @param N number of dimensions
2466  * @param Cluster cluster to be made into an elliptical prototype
2467  * @param Statistics  statistical info about samples in cluster
2468  * @return  Pointer to a new elliptical prototype data structure
2469  */
NewEllipticalProto(int16_t N,CLUSTER * Cluster,STATISTICS * Statistics)2470 static PROTOTYPE *NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
2471   PROTOTYPE *Proto;
2472   int i;
2473 
2474   Proto = NewSimpleProto(N, Cluster);
2475   Proto->Variance.Elliptical = new float[N];
2476   Proto->Magnitude.Elliptical = new float[N];
2477   Proto->Weight.Elliptical = new float[N];
2478 
2479   auto CoVariance = &Statistics->CoVariance[0];
2480   Proto->TotalMagnitude = 1.0;
2481   for (i = 0; i < N; i++, CoVariance += N + 1) {
2482     Proto->Variance.Elliptical[i] = *CoVariance;
2483     if (Proto->Variance.Elliptical[i] < MINVARIANCE) {
2484       Proto->Variance.Elliptical[i] = MINVARIANCE;
2485     }
2486 
2487     Proto->Magnitude.Elliptical[i] = 1.0f / sqrt(2.0f * M_PI * Proto->Variance.Elliptical[i]);
2488     Proto->Weight.Elliptical[i] = 1.0f / Proto->Variance.Elliptical[i];
2489     Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
2490   }
2491   Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
2492   Proto->Style = elliptical;
2493   return (Proto);
2494 } // NewEllipticalProto
2495 
2496 /**
2497  * This routine creates a mixed prototype data structure to
2498  * approximate the samples in the specified cluster.
2499  * Mixed prototypes can have different distributions for
2500  * each dimension.  All dimensions are independent.  The
2501  * structure is initially filled in as though it were an
2502  * elliptical prototype.  The actual distributions of the
2503  * dimensions can be altered by other routines.
2504  * @param N number of dimensions
2505  * @param Cluster cluster to be made into a mixed prototype
2506  * @param Statistics  statistical info about samples in cluster
2507  * @return  Pointer to a new mixed prototype data structure
2508  */
NewMixedProto(int16_t N,CLUSTER * Cluster,STATISTICS * Statistics)2509 static PROTOTYPE *NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
2510   auto Proto = NewEllipticalProto(N, Cluster, Statistics);
2511   Proto->Distrib.clear();
2512   Proto->Distrib.resize(N, normal);
2513   Proto->Style = mixed;
2514   return Proto;
2515 } // NewMixedProto
2516 
2517 /**
2518  * This routine allocates memory to hold a simple prototype
2519  * data structure, i.e. one without independent distributions
2520  * and variances for each dimension.
2521  * @param N number of dimensions
2522  * @param Cluster cluster to be made into a prototype
2523  * @return  Pointer to new simple prototype
2524  */
NewSimpleProto(int16_t N,CLUSTER * Cluster)2525 static PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster) {
2526   auto Proto = new PROTOTYPE;
2527   ASSERT_HOST(N == sizeof(Cluster->Mean));
2528   Proto->Mean = Cluster->Mean;
2529   Proto->Distrib.clear();
2530   Proto->Significant = true;
2531   Proto->Merged = false;
2532   Proto->Style = spherical;
2533   Proto->NumSamples = Cluster->SampleCount;
2534   Proto->Cluster = Cluster;
2535   Proto->Cluster->Prototype = true;
2536   return Proto;
2537 } // NewSimpleProto
2538 
2539 /**
2540  * This routine returns true if the specified covariance
2541  * matrix indicates that all N dimensions are independent of
2542  * one another.  One dimension is judged to be independent of
2543  * another when the magnitude of the corresponding correlation
2544  * coefficient is
2545  * less than the specified Independence factor.  The
2546  * correlation coefficient is calculated as: (see Duda and
2547  * Hart, pg. 247)
2548  * coeff[ij] = stddev[ij] / sqrt (stddev[ii] * stddev[jj])
2549  * The covariance matrix is assumed to be symmetric (which
2550  * should always be true).
2551  * @param ParamDesc descriptions of each feature space dimension
2552  * @param N number of dimensions
2553  * @param CoVariance  ptr to a covariance matrix
2554  * @param Independence  max off-diagonal correlation coefficient
2555  * @return true if dimensions are independent, false otherwise
2556  */
Independent(PARAM_DESC * ParamDesc,int16_t N,float * CoVariance,float Independence)2557 static bool Independent(PARAM_DESC *ParamDesc, int16_t N, float *CoVariance, float Independence) {
2558   int i, j;
2559   float *VARii; // points to ith on-diagonal element
2560   float *VARjj; // points to jth on-diagonal element
2561   float CorrelationCoeff;
2562 
2563   VARii = CoVariance;
2564   for (i = 0; i < N; i++, VARii += N + 1) {
2565     if (ParamDesc[i].NonEssential) {
2566       continue;
2567     }
2568 
2569     VARjj = VARii + N + 1;
2570     CoVariance = VARii + 1;
2571     for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
2572       if (ParamDesc[j].NonEssential) {
2573         continue;
2574       }
2575 
2576       if ((*VARii == 0.0) || (*VARjj == 0.0)) {
2577         CorrelationCoeff = 0.0;
2578       } else {
2579         CorrelationCoeff = sqrt(std::sqrt(*CoVariance * *CoVariance / (*VARii * *VARjj)));
2580       }
2581       if (CorrelationCoeff > Independence) {
2582         return false;
2583       }
2584     }
2585   }
2586   return true;
2587 } // Independent
2588 
2589 /**
2590  * This routine returns a histogram data structure which can
2591  * be used by other routines to place samples into histogram
2592  * buckets, and then apply a goodness of fit test to the
2593  * histogram data to determine if the samples belong to the
2594  * specified probability distribution.  The routine keeps
2595  * a list of bucket data structures which have already been
2596  * created so that it minimizes the computation time needed
2597  * to create a new bucket.
2598  * @param clusterer  which keeps a bucket_cache for us.
2599  * @param Distribution  type of probability distribution to test for
2600  * @param SampleCount number of samples that are available
2601  * @param Confidence  probability of a Type I error
2602  * @return  Bucket data structure
2603  */
GetBuckets(CLUSTERER * clusterer,DISTRIBUTION Distribution,uint32_t SampleCount,double Confidence)2604 static BUCKETS *GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uint32_t SampleCount,
2605                            double Confidence) {
2606   // Get an old bucket structure with the same number of buckets.
2607   uint16_t NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
2608   BUCKETS *Buckets = clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS];
2609 
2610   // If a matching bucket structure is not found, make one and save it.
2611   if (Buckets == nullptr) {
2612     Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
2613     clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] = Buckets;
2614   } else {
2615     // Just adjust the existing buckets.
2616     if (SampleCount != Buckets->SampleCount) {
2617       AdjustBuckets(Buckets, SampleCount);
2618     }
2619     if (Confidence != Buckets->Confidence) {
2620       Buckets->Confidence = Confidence;
2621       Buckets->ChiSquared =
2622           ComputeChiSquared(DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);
2623     }
2624     InitBuckets(Buckets);
2625   }
2626   return Buckets;
2627 } // GetBuckets
2628 
2629 /**
2630  * This routine creates a histogram data structure which can
2631  * be used by other routines to place samples into histogram
2632  * buckets, and then apply a goodness of fit test to the
2633  * histogram data to determine if the samples belong to the
2634  * specified probability distribution.  The buckets are
2635  * allocated in such a way that the expected frequency of
2636  * samples in each bucket is approximately the same.  In
2637  * order to make this possible, a mapping table is
2638  * computed which maps "normalized" samples into the
2639  * appropriate bucket.
2640  * @param Distribution  type of probability distribution to test for
2641  * @param SampleCount number of samples that are available
2642  * @param Confidence  probability of a Type I error
2643  * @return Pointer to new histogram data structure
2644  */
static BUCKETS *MakeBuckets(DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence) {
  // Density function per DISTRIBUTION enum value; D_random reuses the
  // uniform density.
  const DENSITYFUNC DensityFunction[] = {NormalDensity, UniformDensity, UniformDensity};
  int i, j;
  double BucketProbability;
  double NextBucketBoundary;
  double Probability;
  double ProbabilityDelta;
  double LastProbDensity;
  double ProbDensity;
  uint16_t CurrentBucket;
  bool Symmetrical;

  // allocate memory needed for data structure
  auto Buckets = new BUCKETS(OptimumNumberOfBuckets(SampleCount));
  Buckets->SampleCount = SampleCount;
  Buckets->Confidence = Confidence;

  // initialize simple fields
  Buckets->Distribution = Distribution;

  // all currently defined distributions are symmetrical
  Symmetrical = true;
  Buckets->ChiSquared =
      ComputeChiSquared(DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);

  if (Symmetrical) {
    // allocate buckets so that all have approx. equal probability
    BucketProbability = 1.0 / static_cast<double>(Buckets->NumberOfBuckets);

    // distribution is symmetric so fill in upper half then copy.
    // CurrentBucket starts at the middle bucket; for an odd bucket count the
    // middle bucket straddles the center, so only half of its probability
    // lies in the upper half of the table.
    CurrentBucket = Buckets->NumberOfBuckets / 2;
    if (Odd(Buckets->NumberOfBuckets)) {
      NextBucketBoundary = BucketProbability / 2;
    } else {
      NextBucketBoundary = BucketProbability;
    }

    // Walk the upper half of the normalized sample range [0, BUCKETTABLESIZE),
    // integrating the density with the trapezoid rule.  Each table cell is
    // assigned to the current bucket; when the accumulated probability crosses
    // the next boundary, advance to the next bucket (never past the last one).
    Probability = 0.0;
    LastProbDensity = (*DensityFunction[static_cast<int>(Distribution)])(BUCKETTABLESIZE / 2);
    for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
      ProbDensity = (*DensityFunction[static_cast<int>(Distribution)])(i + 1);
      ProbabilityDelta = Integral(LastProbDensity, ProbDensity, 1.0);
      Probability += ProbabilityDelta;
      if (Probability > NextBucketBoundary) {
        if (CurrentBucket < Buckets->NumberOfBuckets - 1) {
          CurrentBucket++;
        }
        NextBucketBoundary += BucketProbability;
      }
      Buckets->Bucket[i] = CurrentBucket;
      Buckets->ExpectedCount[CurrentBucket] += static_cast<float>(ProbabilityDelta * SampleCount);
      LastProbDensity = ProbDensity;
    }
    // place any leftover probability into the last bucket
    // (the upper half of a symmetric distribution holds probability 0.5).
    Buckets->ExpectedCount[CurrentBucket] += static_cast<float>((0.5 - Probability) * SampleCount);

    // copy upper half of distribution to lower half
    // (mirror the bucket indices around the center of the table).
    for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--) {
      Buckets->Bucket[i] = Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);
    }

    // copy upper half of expected counts to lower half.
    // Note the i <= j condition: for an odd bucket count the middle bucket
    // adds to itself, doubling its (half-range) expected count.
    for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--) {
      Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
    }
  }
  return Buckets;
} // MakeBuckets
2713 
2714 /**
2715  * This routine computes the optimum number of histogram
2716  * buckets that should be used in a chi-squared goodness of
2717  * fit test for the specified number of samples.  The optimum
2718  * number is computed based on Table 4.1 on pg. 147 of
2719  * "Measurement and Analysis of Random Data" by Bendat & Piersol.
2720  * Linear interpolation is used to interpolate between table
2721  * values.  The table is intended for a 0.05 level of
2722  * significance (alpha).  This routine assumes that it is
2723  * equally valid for other alpha's, which may not be true.
2724  * @param SampleCount number of samples to be tested
2725  * @return Optimum number of histogram buckets
2726  */
OptimumNumberOfBuckets(uint32_t SampleCount)2727 static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount) {
2728   uint8_t Last, Next;
2729   float Slope;
2730 
2731   if (SampleCount < kCountTable[0]) {
2732     return kBucketsTable[0];
2733   }
2734 
2735   for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {
2736     if (SampleCount <= kCountTable[Next]) {
2737       Slope = static_cast<float>(kBucketsTable[Next] - kBucketsTable[Last]) /
2738               static_cast<float>(kCountTable[Next] - kCountTable[Last]);
2739       return (
2740           static_cast<uint16_t>(kBucketsTable[Last] + Slope * (SampleCount - kCountTable[Last])));
2741     }
2742   }
2743   return kBucketsTable[Last];
2744 } // OptimumNumberOfBuckets
2745 
2746 /**
2747  * This routine computes the chi-squared value which will
2748  * leave a cumulative probability of Alpha in the right tail
2749  * of a chi-squared distribution with the specified number of
2750  * degrees of freedom.  Alpha must be between 0 and 1.
2751  * DegreesOfFreedom must be even.  The routine maintains an
2752  * array of lists.  Each list corresponds to a different
2753  * number of degrees of freedom.  Each entry in the list
2754  * corresponds to a different alpha value and its corresponding
2755  * chi-squared value.  Therefore, once a particular chi-squared
2756  * value is computed, it is stored in the list and never
2757  * needs to be computed again.
2758  * @param DegreesOfFreedom  determines shape of distribution
2759  * @param Alpha probability of right tail
2760  * @return Desired chi-squared value
2761  */
static double ComputeChiSquared(uint16_t DegreesOfFreedom, double Alpha)
// Accuracy to which Solve() computes the chi-squared value.
#define CHIACCURACY 0.01
// Smallest alpha for which a chi-squared value can still be computed.
#define MINALPHA (1e-200)
{
  // Cache of already-computed chi-squared values: one list per (even)
  // number of degrees of freedom, each entry pairing an alpha with its
  // chi-squared value.
  static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];

  // limit the minimum alpha that can be used - if alpha is too small
  //      it may not be possible to compute chi-squared.
  Alpha = ClipToRange(Alpha, MINALPHA, 1.0);
  // Only even degrees of freedom are supported; round odd values up.
  if (Odd(DegreesOfFreedom)) {
    DegreesOfFreedom++;
  }

  /* find the list of chi-squared values which have already been computed
   for the specified number of degrees of freedom.  Search the list for
   the desired chi-squared. */
  CHISTRUCT SearchKey(0.0, Alpha);
  // NOTE(review): this assumes search() yields something whose first_node()
  // is safely nullptr when no entry matches Alpha - confirm against the
  // list implementation.
  auto OldChiSquared = reinterpret_cast<CHISTRUCT *>(
      search(ChiWith[DegreesOfFreedom], &SearchKey, AlphaMatch)->first_node());

  if (OldChiSquared == nullptr) {
    // Cache miss: solve for the chi-squared value numerically and store the
    // result at the front of this DegreesOfFreedom's list.
    OldChiSquared = new CHISTRUCT(DegreesOfFreedom, Alpha);
    OldChiSquared->ChiSquared =
        Solve(ChiArea, OldChiSquared, static_cast<double>(DegreesOfFreedom), CHIACCURACY);
    ChiWith[DegreesOfFreedom] = push(ChiWith[DegreesOfFreedom], OldChiSquared);
  } else {
    // further optimization might move OldChiSquared to front of list
  }

  return (OldChiSquared->ChiSquared);

} // ComputeChiSquared
2794 
2795 /**
2796  * This routine computes the probability density function
2797  * of a discrete normal distribution defined by the global
2798  * variables kNormalMean, kNormalVariance, and kNormalMagnitude.
2799  * Normal magnitude could, of course, be computed in terms of
2800  * the normal variance but it is precomputed for efficiency.
2801  * @param x number to compute the normal probability density for
2802  * @note Globals:
2803  *    kNormalMean mean of a discrete normal distribution
2804  *    kNormalVariance variance of a discrete normal distribution
2805  *    kNormalMagnitude  magnitude of a discrete normal distribution
2806  * @return  The value of the normal distribution at x.
2807  */
NormalDensity(int32_t x)2808 static double NormalDensity(int32_t x) {
2809   double Distance;
2810 
2811   Distance = x - kNormalMean;
2812   return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);
2813 } // NormalDensity
2814 
2815 /**
2816  * This routine computes the probability density function
2817  * of a uniform distribution at the specified point.  The
2818  * range of the distribution is from 0 to BUCKETTABLESIZE.
2819  * @param x number to compute the uniform probability density for
2820  * @return The value of the uniform distribution at x.
2821  */
UniformDensity(int32_t x)2822 static double UniformDensity(int32_t x) {
2823   constexpr auto UniformDistributionDensity = 1.0 / BUCKETTABLESIZE;
2824 
2825   if ((x >= 0) && (x <= BUCKETTABLESIZE)) {
2826     return UniformDistributionDensity;
2827   } else {
2828     return 0.0;
2829   }
2830 } // UniformDensity
2831 
2832 /**
2833  * This routine computes a trapezoidal approximation to the
2834  * integral of a function over a small delta in x.
2835  * @param f1  value of function at x1
2836  * @param f2  value of function at x2
2837  * @param Dx  x2 - x1 (should always be positive)
2838  * @return Approximation of the integral of the function from x1 to x2.
2839  */
static double Integral(double f1, double f2, double Dx) {
  // Trapezoid rule: the area under the chord from (x1, f1) to (x2, f2) is
  // the mean of the two heights times the width Dx.
  const double MeanHeight = (f1 + f2) / 2.0;
  return MeanHeight * Dx;
} // Integral
2843 
2844 /**
2845  * This routine counts the number of cluster samples which
2846  * fall within the various histogram buckets in Buckets.  Only
2847  * one dimension of each sample is examined.  The exact meaning
2848  * of the Mean and StdDev parameters depends on the
2849  * distribution which is being analyzed (this info is in the
2850  * Buckets data structure).  For normal distributions, Mean
2851  * and StdDev have the expected meanings.  For uniform and
2852  * random distributions the Mean is the center point of the
2853  * range and the StdDev is 1/2 the range.  A dimension with
2854  * zero standard deviation cannot be statistically analyzed.
2855  * In this case, a pseudo-analysis is used.
2856  * The Buckets data structure is filled in.
2857  * @param Buckets histogram buckets to count samples
2858  * @param Cluster cluster whose samples are being analyzed
2859  * @param Dim dimension of samples which is being analyzed
2860  * @param ParamDesc description of the dimension
2861  * @param Mean  "mean" of the distribution
2862  * @param StdDev  "standard deviation" of the distribution
2863  */
FillBuckets(BUCKETS * Buckets,CLUSTER * Cluster,uint16_t Dim,PARAM_DESC * ParamDesc,float Mean,float StdDev)2864 static void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc,
2865                         float Mean, float StdDev) {
2866   uint16_t BucketID;
2867   int i;
2868   LIST SearchState;
2869   SAMPLE *Sample;
2870 
2871   // initialize the histogram bucket counts to 0
2872   for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2873     Buckets->Count[i] = 0;
2874   }
2875 
2876   if (StdDev == 0.0) {
2877     /* if the standard deviation is zero, then we can't statistically
2878    analyze the cluster.  Use a pseudo-analysis: samples exactly on
2879    the mean are distributed evenly across all buckets.  Samples greater
2880    than the mean are placed in the last bucket; samples less than the
2881    mean are placed in the first bucket. */
2882 
2883     InitSampleSearch(SearchState, Cluster);
2884     i = 0;
2885     while ((Sample = NextSample(&SearchState)) != nullptr) {
2886       if (Sample->Mean[Dim] > Mean) {
2887         BucketID = Buckets->NumberOfBuckets - 1;
2888       } else if (Sample->Mean[Dim] < Mean) {
2889         BucketID = 0;
2890       } else {
2891         BucketID = i;
2892       }
2893       Buckets->Count[BucketID] += 1;
2894       i++;
2895       if (i >= Buckets->NumberOfBuckets) {
2896         i = 0;
2897       }
2898     }
2899   } else {
2900     // search for all samples in the cluster and add to histogram buckets
2901     InitSampleSearch(SearchState, Cluster);
2902     while ((Sample = NextSample(&SearchState)) != nullptr) {
2903       switch (Buckets->Distribution) {
2904         case normal:
2905           BucketID = NormalBucket(ParamDesc, Sample->Mean[Dim], Mean, StdDev);
2906           break;
2907         case D_random:
2908         case uniform:
2909           BucketID = UniformBucket(ParamDesc, Sample->Mean[Dim], Mean, StdDev);
2910           break;
2911         default:
2912           BucketID = 0;
2913       }
2914       Buckets->Count[Buckets->Bucket[BucketID]] += 1;
2915     }
2916   }
2917 } // FillBuckets
2918 
2919 /**
2920  * This routine determines which bucket x falls into in the
2921  * discrete normal distribution defined by kNormalMean
2922  * and kNormalStdDev.  x values which exceed the range of
2923  * the discrete distribution are clipped.
2924  * @param ParamDesc used to identify circular dimensions
2925  * @param x value to be normalized
2926  * @param Mean  mean of normal distribution
2927  * @param StdDev  standard deviation of normal distribution
2928  * @return Bucket number into which x falls
2929  */
NormalBucket(PARAM_DESC * ParamDesc,float x,float Mean,float StdDev)2930 static uint16_t NormalBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev) {
2931   float X;
2932 
2933   // wraparound circular parameters if necessary
2934   if (ParamDesc->Circular) {
2935     if (x - Mean > ParamDesc->HalfRange) {
2936       x -= ParamDesc->Range;
2937     } else if (x - Mean < -ParamDesc->HalfRange) {
2938       x += ParamDesc->Range;
2939     }
2940   }
2941 
2942   X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean;
2943   if (X < 0) {
2944     return 0;
2945   }
2946   if (X > BUCKETTABLESIZE - 1) {
2947     return (static_cast<uint16_t>(BUCKETTABLESIZE - 1));
2948   }
2949   return static_cast<uint16_t>(floor(static_cast<double>(X)));
2950 } // NormalBucket
2951 
2952 /**
2953  * This routine determines which bucket x falls into in the
2954  * discrete uniform distribution defined by
2955  * BUCKETTABLESIZE.  x values which exceed the range of
2956  * the discrete distribution are clipped.
2957  * @param ParamDesc used to identify circular dimensions
2958  * @param x value to be normalized
2959  * @param Mean  center of range of uniform distribution
2960  * @param StdDev  1/2 the range of the uniform distribution
2961  * @return Bucket number into which x falls
2962  */
UniformBucket(PARAM_DESC * ParamDesc,float x,float Mean,float StdDev)2963 static uint16_t UniformBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev) {
2964   float X;
2965 
2966   // wraparound circular parameters if necessary
2967   if (ParamDesc->Circular) {
2968     if (x - Mean > ParamDesc->HalfRange) {
2969       x -= ParamDesc->Range;
2970     } else if (x - Mean < -ParamDesc->HalfRange) {
2971       x += ParamDesc->Range;
2972     }
2973   }
2974 
2975   X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0);
2976   if (X < 0) {
2977     return 0;
2978   }
2979   if (X > BUCKETTABLESIZE - 1) {
2980     return static_cast<uint16_t>(BUCKETTABLESIZE - 1);
2981   }
2982   return static_cast<uint16_t>(floor(static_cast<double>(X)));
2983 } // UniformBucket
2984 
2985 /**
2986  * This routine performs a chi-square goodness of fit test
2987  * on the histogram data in the Buckets data structure.
2988  * true is returned if the histogram matches the probability
2989  * distribution which was specified when the Buckets
2990  * structure was originally created.  Otherwise false is
2991  * returned.
2992  * @param Buckets   histogram data to perform chi-square test on
2993  * @return true if samples match distribution, false otherwise
2994  */
DistributionOK(BUCKETS * Buckets)2995 static bool DistributionOK(BUCKETS *Buckets) {
2996   float FrequencyDifference;
2997   float TotalDifference;
2998   int i;
2999 
3000   // compute how well the histogram matches the expected histogram
3001   TotalDifference = 0.0;
3002   for (i = 0; i < Buckets->NumberOfBuckets; i++) {
3003     FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
3004     TotalDifference += (FrequencyDifference * FrequencyDifference) / Buckets->ExpectedCount[i];
3005   }
3006 
3007   // test to see if the difference is more than expected
3008   if (TotalDifference > Buckets->ChiSquared) {
3009     return false;
3010   } else {
3011     return true;
3012   }
3013 } // DistributionOK
3014 
3015 /**
3016  * This routine computes the degrees of freedom that should
3017  * be used in a chi-squared test with the specified number of
3018  * histogram buckets.  The result is always rounded up to
3019  * the next even number so that the value of chi-squared can be
3020  * computed more easily.  This will cause the value of
3021  * chi-squared to be higher than the optimum value, resulting
3022  * in the chi-square test being more lenient than optimum.
3023  * @param Distribution    distribution being tested for
3024  * @param HistogramBuckets  number of buckets in chi-square test
3025  * @return The number of degrees of freedom for a chi-square test
3026  */
DegreesOfFreedom(DISTRIBUTION Distribution,uint16_t HistogramBuckets)3027 static uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets) {
3028   static uint8_t DegreeOffsets[] = {3, 3, 1};
3029 
3030   uint16_t AdjustedNumBuckets;
3031 
3032   AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[static_cast<int>(Distribution)];
3033   if (Odd(AdjustedNumBuckets)) {
3034     AdjustedNumBuckets++;
3035   }
3036   return (AdjustedNumBuckets);
3037 
3038 } // DegreesOfFreedom
3039 
3040 /**
3041  * This routine multiplies each ExpectedCount histogram entry
3042  * by NewSampleCount/OldSampleCount so that the histogram
3043  * is now adjusted to the new sample count.
3044  * @param Buckets histogram data structure to adjust
3045  * @param NewSampleCount  new sample count to adjust to
3046  */
AdjustBuckets(BUCKETS * Buckets,uint32_t NewSampleCount)3047 static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount) {
3048   int i;
3049   double AdjustFactor;
3050 
3051   AdjustFactor =
3052       ((static_cast<double>(NewSampleCount)) / (static_cast<double>(Buckets->SampleCount)));
3053 
3054   for (i = 0; i < Buckets->NumberOfBuckets; i++) {
3055     Buckets->ExpectedCount[i] *= AdjustFactor;
3056   }
3057 
3058   Buckets->SampleCount = NewSampleCount;
3059 
3060 } // AdjustBuckets
3061 
3062 /**
3063  * This routine sets the bucket counts in the specified histogram
3064  * to zero.
3065  * @param Buckets histogram data structure to init
3066  */
InitBuckets(BUCKETS * Buckets)3067 static void InitBuckets(BUCKETS *Buckets) {
3068   int i;
3069 
3070   for (i = 0; i < Buckets->NumberOfBuckets; i++) {
3071     Buckets->Count[i] = 0;
3072   }
3073 
3074 } // InitBuckets
3075 
3076 /**
3077  * This routine is used to search a list of structures which
3078  * hold pre-computed chi-squared values for a chi-squared
3079  * value whose corresponding alpha field matches the alpha
3080  * field of SearchKey.
3081  *
3082  * It is called by the list search routines.
3083  *
3084  * @param arg1 chi-squared struct being tested for a match
3085  * @param arg2 chi-squared struct that is the search key
3086  * @return true if ChiStruct's Alpha matches SearchKey's Alpha
3087  */
AlphaMatch(void * arg1,void * arg2)3088 static int AlphaMatch(void *arg1,   // CHISTRUCT *ChiStruct,
3089                       void *arg2) { // CHISTRUCT *SearchKey)
3090   auto *ChiStruct = static_cast<CHISTRUCT *>(arg1);
3091   auto *SearchKey = static_cast<CHISTRUCT *>(arg2);
3092 
3093   return (ChiStruct->Alpha == SearchKey->Alpha);
3094 
3095 } // AlphaMatch
3096 
3097 /**
3098  * This routine attempts to find an x value at which Function
3099  * goes to zero (i.e. a root of the function).  It will only
3100  * work correctly if a solution actually exists and there
3101  * are no extrema between the solution and the InitialGuess.
3102  * The algorithms used are extremely primitive.
3103  *
3104  * @param Function  function whose zero is to be found
3105  * @param FunctionParams  arbitrary data to pass to function
3106  * @param InitialGuess  point to start solution search at
3107  * @param Accuracy  maximum allowed error
3108  * @return Solution of function (x for which f(x) = 0).
3109  */
Solve(SOLVEFUNC Function,void * FunctionParams,double InitialGuess,double Accuracy)3110 static double Solve(SOLVEFUNC Function, void *FunctionParams, double InitialGuess, double Accuracy)
3111 #define INITIALDELTA 0.1
3112 #define DELTARATIO 0.1
3113 {
3114   double x;
3115   double f;
3116   double Slope;
3117   double Delta;
3118   double NewDelta;
3119   double xDelta;
3120   double LastPosX, LastNegX;
3121 
3122   x = InitialGuess;
3123   Delta = INITIALDELTA;
3124   LastPosX = FLT_MAX;
3125   LastNegX = -FLT_MAX;
3126   f = (*Function)(static_cast<CHISTRUCT *>(FunctionParams), x);
3127   while (Abs(LastPosX - LastNegX) > Accuracy) {
3128     // keep track of outer bounds of current estimate
3129     if (f < 0) {
3130       LastNegX = x;
3131     } else {
3132       LastPosX = x;
3133     }
3134 
3135     // compute the approx. slope of f(x) at the current point
3136     Slope = ((*Function)(static_cast<CHISTRUCT *>(FunctionParams), x + Delta) - f) / Delta;
3137 
3138     // compute the next solution guess */
3139     xDelta = f / Slope;
3140     x -= xDelta;
3141 
3142     // reduce the delta used for computing slope to be a fraction of
3143     // the amount moved to get to the new guess
3144     NewDelta = Abs(xDelta) * DELTARATIO;
3145     if (NewDelta < Delta) {
3146       Delta = NewDelta;
3147     }
3148 
3149     // compute the value of the function at the new guess
3150     f = (*Function)(static_cast<CHISTRUCT *>(FunctionParams), x);
3151   }
3152   return (x);
3153 
3154 } // Solve
3155 
3156 /**
3157  * This routine computes the area under a chi density curve
3158  * from 0 to x, minus the desired area under the curve.  The
3159  * number of degrees of freedom of the chi curve is specified
3160  * in the ChiParams structure.  The desired area is also
3161  * specified in the ChiParams structure as Alpha (or 1 minus
3162  * the desired area).  This routine is intended to be passed
3163  * to the Solve() function to find the value of chi-squared
3164  * which will yield a desired area under the right tail of
3165  * the chi density curve.  The function will only work for
3166  * even degrees of freedom.  The equations are based on
3167  * integrating the chi density curve in parts to obtain
3168  * a series that can be used to compute the area under the
3169  * curve.
3170  * @param ChiParams contains degrees of freedom and alpha
3171  * @param x   value of chi-squared to evaluate
3172  * @return Error between actual and desired area under the chi curve.
3173  */
ChiArea(CHISTRUCT * ChiParams,double x)3174 static double ChiArea(CHISTRUCT *ChiParams, double x) {
3175   int i, N;
3176   double SeriesTotal;
3177   double Denominator;
3178   double PowerOfx;
3179 
3180   N = ChiParams->DegreesOfFreedom / 2 - 1;
3181   SeriesTotal = 1;
3182   Denominator = 1;
3183   PowerOfx = 1;
3184   for (i = 1; i <= N; i++) {
3185     Denominator *= 2 * i;
3186     PowerOfx *= x;
3187     SeriesTotal += PowerOfx / Denominator;
3188   }
3189   return ((SeriesTotal * exp(-0.5 * x)) - ChiParams->Alpha);
3190 
3191 } // ChiArea
3192 
3193 /**
3194  * This routine looks at all samples in the specified cluster.
3195  * It computes a running estimate of the percentage of the
3196  * characters which have more than 1 sample in the cluster.
3197  * When this percentage exceeds MaxIllegal, true is returned.
3198  * Otherwise false is returned.  The CharID
3199  * fields must contain integers which identify the training
3200  * characters which were used to generate the sample.  One
3201  * integer is used for each sample.  The NumChar field in
3202  * the Clusterer must contain the number of characters in the
3203  * training set.  All CharID fields must be between 0 and
3204  * NumChar-1.  The main function of this routine is to help
3205  * identify clusters which need to be split further, i.e. if
3206  * numerous training characters have 2 or more features which are
3207  * contained in the same cluster, then the cluster should be
3208  * split.
3209  *
3210  * @param Clusterer data structure holding cluster tree
3211  * @param Cluster   cluster containing samples to be tested
3212  * @param MaxIllegal  max percentage of samples allowed to have
3213  *        more than 1 feature in the cluster
3214  * @return true if the cluster should be split, false otherwise.
3215  */
MultipleCharSamples(CLUSTERER * Clusterer,CLUSTER * Cluster,float MaxIllegal)3216 static bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, float MaxIllegal)
3217 #define ILLEGAL_CHAR 2
3218 {
3219   static std::vector<uint8_t> CharFlags;
3220   LIST SearchState;
3221   SAMPLE *Sample;
3222   int32_t CharID;
3223   int32_t NumCharInCluster;
3224   int32_t NumIllegalInCluster;
3225   float PercentIllegal;
3226 
3227   // initial estimate assumes that no illegal chars exist in the cluster
3228   NumCharInCluster = Cluster->SampleCount;
3229   NumIllegalInCluster = 0;
3230 
3231   if (Clusterer->NumChar > CharFlags.size()) {
3232     CharFlags.resize(Clusterer->NumChar);
3233   }
3234 
3235   for (auto &CharFlag : CharFlags) {
3236     CharFlag = false;
3237   }
3238 
3239   // find each sample in the cluster and check if we have seen it before
3240   InitSampleSearch(SearchState, Cluster);
3241   while ((Sample = NextSample(&SearchState)) != nullptr) {
3242     CharID = Sample->CharID;
3243     if (CharFlags[CharID] == false) {
3244       CharFlags[CharID] = true;
3245     } else {
3246       if (CharFlags[CharID] == true) {
3247         NumIllegalInCluster++;
3248         CharFlags[CharID] = ILLEGAL_CHAR;
3249       }
3250       NumCharInCluster--;
3251       PercentIllegal = static_cast<float>(NumIllegalInCluster) / NumCharInCluster;
3252       if (PercentIllegal > MaxIllegal) {
3253         destroy(SearchState);
3254         return true;
3255       }
3256     }
3257   }
3258   return false;
3259 
3260 } // MultipleCharSamples
3261 
3262 /**
3263  * Compute the inverse of a matrix using LU decomposition with partial pivoting.
3264  * The return value is the sum of norms of the off-diagonal terms of the
3265  * product of a and inv. (A measure of the error.)
3266  */
InvertMatrix(const float * input,int size,float * inv)3267 static double InvertMatrix(const float *input, int size, float *inv) {
3268   // Allocate memory for the 2D arrays.
3269   GENERIC_2D_ARRAY<double> U(size, size, 0.0);
3270   GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0);
3271   GENERIC_2D_ARRAY<double> L(size, size, 0.0);
3272 
3273   // Initialize the working matrices. U starts as input, L as I and U_inv as O.
3274   int row;
3275   int col;
3276   for (row = 0; row < size; row++) {
3277     for (col = 0; col < size; col++) {
3278       U[row][col] = input[row * size + col];
3279       L[row][col] = row == col ? 1.0 : 0.0;
3280       U_inv[row][col] = 0.0;
3281     }
3282   }
3283 
3284   // Compute forward matrix by inversion by LU decomposition of input.
3285   for (col = 0; col < size; ++col) {
3286     // Find best pivot
3287     int best_row = 0;
3288     double best_pivot = -1.0;
3289     for (row = col; row < size; ++row) {
3290       if (Abs(U[row][col]) > best_pivot) {
3291         best_pivot = Abs(U[row][col]);
3292         best_row = row;
3293       }
3294     }
3295     // Exchange pivot rows.
3296     if (best_row != col) {
3297       for (int k = 0; k < size; ++k) {
3298         double tmp = U[best_row][k];
3299         U[best_row][k] = U[col][k];
3300         U[col][k] = tmp;
3301         tmp = L[best_row][k];
3302         L[best_row][k] = L[col][k];
3303         L[col][k] = tmp;
3304       }
3305     }
3306     // Now do the pivot itself.
3307     for (row = col + 1; row < size; ++row) {
3308       double ratio = -U[row][col] / U[col][col];
3309       for (int j = col; j < size; ++j) {
3310         U[row][j] += U[col][j] * ratio;
3311       }
3312       for (int k = 0; k < size; ++k) {
3313         L[row][k] += L[col][k] * ratio;
3314       }
3315     }
3316   }
3317   // Next invert U.
3318   for (col = 0; col < size; ++col) {
3319     U_inv[col][col] = 1.0 / U[col][col];
3320     for (row = col - 1; row >= 0; --row) {
3321       double total = 0.0;
3322       for (int k = col; k > row; --k) {
3323         total += U[row][k] * U_inv[k][col];
3324       }
3325       U_inv[row][col] = -total / U[row][row];
3326     }
3327   }
3328   // Now the answer is U_inv.L.
3329   for (row = 0; row < size; row++) {
3330     for (col = 0; col < size; col++) {
3331       double sum = 0.0;
3332       for (int k = row; k < size; ++k) {
3333         sum += U_inv[row][k] * L[k][col];
3334       }
3335       inv[row * size + col] = sum;
3336     }
3337   }
3338   // Check matrix product.
3339   double error_sum = 0.0;
3340   for (row = 0; row < size; row++) {
3341     for (col = 0; col < size; col++) {
3342       double sum = 0.0;
3343       for (int k = 0; k < size; ++k) {
3344         sum += static_cast<double>(input[row * size + k]) * inv[k * size + col];
3345       }
3346       if (row != col) {
3347         error_sum += Abs(sum);
3348       }
3349     }
3350   }
3351   return error_sum;
3352 }
3353 
} // namespace tesseract