1 /*
2
3 HyPhy - Hypothesis Testing Using Phylogenies.
4
5 Copyright (C) 1997-now
6 Core Developers:
7 Sergei L Kosakovsky Pond (sergeilkp@icloud.com)
8 Art FY Poon (apoon42@uwo.ca)
9 Steven Weaver (sweaver@temple.edu)
10
11 Module Developers:
12 Lance Hepler (nlhepler@gmail.com)
13 Martin Smith (martin.audacis@gmail.com)
14
15 Significant contributions from:
16 Spencer V Muse (muse@stat.ncsu.edu)
17 Simon DW Frost (sdf22@cam.ac.uk)
18
19 Permission is hereby granted, free of charge, to any person obtaining a
20 copy of this software and associated documentation files (the
21 "Software"), to deal in the Software without restriction, including
22 without limitation the rights to use, copy, modify, merge, publish,
23 distribute, sublicense, and/or sell copies of the Software, and to
24 permit persons to whom the Software is furnished to do so, subject to
25 the following conditions:
26
27 The above copyright notice and this permission notice shall be included
28 in all copies or substantial portions of the Software.
29
30 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
31 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
32 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
33 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
34 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
35 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
36 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
37
38 */
39
40
41 #if not defined __AFYP_REWRITE_BGM__
42
43 #ifdef __HYPHYQT__
44 #include "hyphymain.h"
45 #endif
46
47 #if defined __MAC__ || defined __WINDOZE__ || defined __HYPHY_GTK__
48 #include "HYConsoleWindow.h"
49 #include "HYDialogs.h"
50
51 #endif
52
53
54 //#define __AFYP_DEVELOPMENT__
55 #define __MISSING_DATA__
56
57
58 #include "bgm.h"
59
60 _String _HYBgm_BAN_PARENT_KEY ("BanParent"),
61 _HYBgm_BAN_CHILD_KEY ("BanChild"),
62 _HYBgm_ENFORCE_PARENT_KEY ("EnforceParent"),
63 _HYBgm_ENFORCE_CHILD_KEY ("EnforceChild"),
64
65 _HYBgm_NODE_INDEX_KEY ("NodeID"),
66 _HYBgm_PRIOR_SIZE_KEY ("PriorSize"),
67 _HYBgm_MAX_PARENT_KEY ("MaxParents"),
68
69 _HYBgm_PRIOR_MEAN_KEY ("PriorMean"), /* for continuous (Gaussian) nodes */
70 _HYBgm_PRIOR_PREC_KEY ("PriorVar"),
71
72 /*SLKP 20070926; add string constants for progress report updates */
73 _HYBgm_STATUS_LINE_MCMC ("Running Bgm MCMC"),
74 _HYBgm_STATUS_LINE_MCMC_DONE ("Finished Bgm MCMC"),
75 _HYBgm_STATUS_LINE_CACHE ("Caching Bgm scores"),
76 _HYBgm_STATUS_LINE_CACHE_DONE ("Done caching Bgm scores"),
77 /*SLKP*/
78
79 _HYBgm_MPI_CACHING ("USE_MPI_CACHING"),
80
81 /* maxParentString ("Bgm_MAXIMUM_PARENTS"), */
82 maxNumRestart ("BGM_NUM_RESTARTS"),
83 numRandomize ("BGM_NUM_RANDOMIZE"),
84 useNodeOrder ("BGM_USE_NODEORDER"),
85 bgmOptimizationMethod ("BGM_OPTIMIZATION_METHOD"),
86
87 useMPIcaching ("USE_MPI_CACHING"),
88
89 mcmcNumChains ("BGM_MCMC_NCHAINS"), // re-use parameter
90 mcmcTemperature ("BGM_MCMC_TEMPERATURE"),
91 mcmcSteps ("BGM_MCMC_DURATION"),
92 mcmcBurnin ("BGM_MCMC_BURNIN"),
93 mcmcSamples ("BGM_MCMC_SAMPLES"),
94
95 mcemMaxSteps ("BGM_MCEM_MAXSTEPS"),
96 mcemBurnin ("BGM_MCEM_BURNIN"),
97 mcemSampleSize ("BGM_MCEM_SAMPLES");
98
99
100 #ifdef __UNIX__
101
102 void ConsoleBGMStatus (_String, hyFloat, _String * fileName);
103
104 //__________________________________________________________________________________
105
ConsoleBGMStatus(_String statusLine,hyFloat percentDone,_String * fileName=nil)106 void ConsoleBGMStatus (_String statusLine, hyFloat percentDone, _String * fileName = nil)
107 {
108 FILE *outFile = fileName?doFileOpen (fileName->sData,"w"):nil;
109
110 _String reportLine (statusLine);
111
112
113
114
115 if (percentDone >= 0.0) {
116 reportLine = reportLine & ". " & percentDone & "% done.";
117 }
118
119 if (outFile) {
120 fprintf (outFile,"%s", reportLine.sData);
121 } else if (verbosity_level == 1) {
122 printf ("\033\015 %s", reportLine.sData);
123 }
124
125 if (percentDone < -1.5) {
126 printf ("\033\015 ");
127 setvbuf (stdout,nil,_IOLBF,1024);
128 } else if (percentDone < -0.5) {
129 setvbuf (stdout,nil, _IONBF,1);
130 }
131 if (outFile) {
132 fclose (outFile);
133 }
134
135 }
136
137 #endif
138
139
140 //___________________________________________________________________________________________
141 // afyp Oct. 15, 2008, replaced wrapper function for _Constant member function
142 // with this local function.
LnGamma(hyFloat theValue)143 hyFloat Bgm::LnGamma(hyFloat theValue)
144 {
145 if (theValue <= 0) {
146
147
148 _String oops ("ERROR: Requested Bgm::LnGamma(x) for x <= 0.");
149 WarnError (oops);
150
151 return 0;
152 }
153
154 static hyFloat lngammaCoeff [6] = { 76.18009172947146,
155 -86.50532032941677,
156 24.01409824083091,
157 - 1.231739572450155,
158 0.1208650973866179e-2,
159 - 0.5395239384953e-5
160 };
161
162 static hyFloat lookUpTable [20] = { 0., 0., 0.6931472, 1.7917595, 3.1780538,
163 4.7874917, 6.5792512, 8.5251614, 10.6046029, 12.8018275,
164 15.1044126, 17.5023078, 19.9872145, 22.5521639, 25.1912212,
165 27.8992714, 30.6718601, 33.5050735, 36.3954452, 39.3398842
166 };
167
168 // use look-up table for small integer values
169 if (theValue <= 20 && (theValue - (long)theValue) > 0.) {
170 return (lookUpTable [(long) theValue - 1]);
171 }
172
173
174 // else do it the hard way
175 hyFloat x, y, tmp, ser;
176
177 y = x = theValue;
178 tmp = x + 5.5;
179 tmp -= (x+0.5) * log(tmp);
180 ser = 1.000000000190015;
181
182 for (long j = 0; j <= 5; j++) {
183 ser += lngammaCoeff[j] / ++y;
184 }
185
186 return (-tmp + log(2.506628274631005*ser/x));
187 }
188
189
190
191 //___________________________________________________________________________________________
Bgm(_AssociativeList * dnodes,_AssociativeList * cnodes)192 Bgm::Bgm(_AssociativeList * dnodes, _AssociativeList * cnodes)
193 {
194 _String errorMessage;
195
196 long num_discrete = dnodes->avl.countitems(),
197 num_continuous = cnodes->avl.countitems(),
198 node_index;
199
200 num_nodes = num_discrete + num_continuous; // set member variable
201
202
203
204 _Constant * node_id, /* pointers into HBL associative array arguments */
205 * mp,
206 * size, * mean, * precision;
207
208 _AssociativeList * discreteNode, * continuousNode;
209
210
211
212 // set member variables to default values
213 calc_bit = new _Constant (); // for calculating LnGamma in member functions
214 max_max_parents = 0; // record the most parents a node can have
215 obsData = NULL;
216 obsWeights = NULL;
217
218
219
220 // extract number of variables in data
221 CreateMatrix (&dag, num_nodes, num_nodes, false, true, false); // allocate space for matrices
222 CreateMatrix (&banned_edges, num_nodes, num_nodes, false, true, false);
223 CreateMatrix (&enforced_edges, num_nodes, num_nodes, false, true, false);
224 CreateMatrix (&prior_sample_size, num_nodes, 1, false, true, false); // for storing floats as hyFloat objects
225 CreateMatrix (&prior_mean, num_nodes, 1, false, true, false);
226 CreateMatrix (&prior_precision, num_nodes, 1, false, true, false);
227
228
229 // allocate space for _SimpleList objects
230 is_discrete.Populate (num_nodes, 0, 0);
231 num_levels.Populate (num_nodes, 0, 0);
232 max_parents.Populate (num_nodes, 0, 0);
233 has_missing.Populate (num_nodes, 0, 0);
234
235
236
237 // initialize discrete nodes
238 for (long dn = 0; dn < num_discrete; dn++) {
239 discreteNode = (_AssociativeList *) (dnodes->GetByKey (dn, ASSOCIATIVE_LIST));
240
241 if (discreteNode) {
242 node_id = (_Constant *) (discreteNode->GetByKey (_HYBgm_NODE_INDEX_KEY, NUMBER));
243 mp = (_Constant *) (discreteNode->GetByKey (_HYBgm_MAX_PARENT_KEY, NUMBER));
244 size = (_Constant *) (discreteNode->GetByKey (_HYBgm_PRIOR_SIZE_KEY, NUMBER));
245
246 if (node_id && mp && size) {
247 node_index = (long) (node_id->Value());
248
249 is_discrete.list_data[node_index] = 1.;
250 max_parents.list_data[node_index] = (long) mp->Value();
251
252 if ((long) mp->Value() > max_max_parents) {
253 max_max_parents = (long) mp->Value();
254 }
255
256 prior_sample_size.Store(node_index, 0, (long) size->Value());
257 } else {
258 errorMessage = _String ("Missing key (NodeID, MaxParents, PriorSize) in associative array for discrete node ")
259 & dn;
260 break;
261 }
262 } else {
263 errorMessage = _String("Failed to retrieve discrete node specification at index ") & dn;
264 break;
265 }
266 }
267 if (errorMessage.sLength) {
268 WarnError (errorMessage);
269 errorMessage = _String(); // reset to empty string
270 }
271
272
273
274 // initialize continuous nodes
275 for (long cn = 0; cn < num_continuous; cn++) {
276 continuousNode = (_AssociativeList *) (cnodes->GetByKey (cn, ASSOCIATIVE_LIST));
277
278 if (continuousNode) {
279 node_id = (_Constant *) (continuousNode->GetByKey (_HYBgm_NODE_INDEX_KEY, NUMBER));
280 mp = (_Constant *) (continuousNode->GetByKey (_HYBgm_MAX_PARENT_KEY, NUMBER));
281 size = (_Constant *) (continuousNode->GetByKey (_HYBgm_PRIOR_SIZE_KEY, NUMBER));
282 mean = (_Constant *) (continuousNode->GetByKey (_HYBgm_PRIOR_MEAN_KEY, NUMBER));
283 precision = (_Constant *) (continuousNode->GetByKey (_HYBgm_PRIOR_PREC_KEY, NUMBER));
284
285 if (node_id && mp && size && mean && precision) {
286 node_index = (long) (node_id->Value());
287
288 is_discrete.list_data[node_index] = 0.;
289 max_parents.list_data[node_index] = (long) mp->Value();
290
291 if (mp->Value() > max_max_parents) {
292 max_max_parents = (long) mp->Value();
293 }
294
295 prior_sample_size.Store (node_index, 0, (hyFloat) size->Value());
296 prior_mean.Store (node_index, 0, (hyFloat) mean->Value());
297 prior_precision.Store (node_index, 0, (hyFloat) precision->Value());
298
299 } else {
300 errorMessage = _String ("Missing key (NodeID, MaxParents, PriorSize, PriorMean, PriorVar)")
301 & "in associative array for continuous node " & cn;
302 break;
303 }
304 } else {
305 errorMessage = _String ("Failed to retrieve continuous node specification at index ") & cn;
306 break;
307 }
308 }
309
310 if (errorMessage.sLength) {
311 WarnError (errorMessage);
312 errorMessage = _String(); // reset to empty string
313 }
314
315
316
317 // append duplicates of _List object to store pointers to _Constant, _Matrix, _NTupleStorage objects
318 _List emptyList (max_max_parents+1);
319 for (long node = 0; node < num_nodes; node++) {
320 node_scores && (&emptyList);
321 }
322
323
324 // require Bgm to recalculate node scores
325 scores_cached = FALSE;
326 }
327
328
329
330 //___________________________________________________________________________________________
~Bgm(void)331 Bgm::~Bgm (void)
332 {
333 DeleteObject (calc_bit);
334 }
335
336
337
338 //___________________________________________________________________________________________
SetDataMatrix(_Matrix * data)339 void Bgm::SetDataMatrix (_Matrix * data)
340 {
341 // reset cached node scores and edge posteriors
342 // DumpComputeLists ();
343 scores_cached = FALSE;
344
345 if (obsData) {
346 DeleteObject (obsData); // dispense with previous data matrix
347 }
348
349 obsData = data;
350
351 obsData->CheckIfSparseEnough(TRUE); // check if we can use a more compact representation of the
352 // matrix; before including this command, we suffered a
353 // noticeable slow-down in this routine. - AFYP
354
355 /*
356 // allocate space to weight matrix
357 if (obsWeights)
358 {
359 DeleteObject(obsWeights);
360 }
361 obsWeights = new _Matrix (obsData->GetHDim(), obsData->GetVDim(), false, true);
362
363 // by default all cases are weighted equally - require user to call SetWeightMatrix() if otherwise
364 for (long row = 0; row < obsData->GetHDim(); row++)
365 for (long col = 0; col < obsData->GetVDim(); col++)
366 obsWeights->Store(row, col, 1.);
367
368 */
369
370 // reset data-dependent member variables
371 for (long node = 0; node < num_nodes; node++) {
372 has_missing.list_data[node] = 0;
373 num_levels.list_data[node] = 0;
374 }
375
376
377
378 #ifdef DEBUG_SDM
379 char buf [256];
380 for (long i = 0; i < obsData->GetHDim(); i++) {
381 for (long j = 0; j < obsData->GetVDim(); j++) {
382 snprintf (buf, sizeof(buf), "%d ", (long) (*obsData)(i,j));
383 BufferToConsole (buf);
384 }
385 NLToConsole ();
386 }
387 #endif
388
389
390
391 if (obsData->GetVDim() == num_nodes) { // make sure the data matrix is compatible with graph
392 #ifdef __MISSING_DATA__
393 long nrows = obsData->GetHDim();
394
395 for (long node = 0; node < num_nodes; node++) {
396 if (is_discrete.list_data[node]) {
397 num_levels.list_data[node] = 1;
398
399 for (long obs, row = 0; row < nrows; row++) {
400 obs = (*obsData)(row, node);
401
402 if (has_missing.list_data[node] == 0 && obs < 0) { // use negative integer values to annotate missing data
403 has_missing.list_data[node] = 1;
404 continue; // skip next step to check levels
405 }
406
407 if (obs+1 > num_levels.list_data[node]) {
408 num_levels.list_data[node] = num_levels.list_data[node] + 1;
409 }
410 }
411 } else {
412 num_levels.list_data[node] = 0;
413
414 // not implementing missing data for continuous nodes yet
415 // not until I've decided on annotation anyhow :-/ afyp
416 }
417 }
418
419 #ifdef DEBUG_SDM
420 snprintf (buf, sizeof(buf), "Levels: ");
421 BufferToConsole (buf);
422 for (long i = 0; i < num_nodes; i++) {
423 snprintf (buf, sizeof(buf), "%d ", num_levels.list_data[i]);
424 BufferToConsole (buf);
425 }
426 NLToConsole ();
427
428 snprintf (buf, sizeof(buf), "Missing (0=FALSE, 1=TRUE): ");
429 BufferToConsole (buf);
430 for (long i = 0; i < num_nodes; i++) {
431 snprintf (buf, sizeof(buf), "%d ", has_missing.list_data[i]);
432 BufferToConsole (buf);
433 }
434 NLToConsole ();
435 #endif
436
437 #else
438 for (long node = 0; node < num_nodes; node++) { // for every column in matrix
439 if (is_discrete.list_data[node]) {
440 // calculate number of levels for discrete node
441 num_levels.list_data[node] = 1;
442
443 for (long obs = 0; obs < obsData->GetHDim(); obs++) {
444 // adjust for zero-indexing
445 if ( ((*obsData)(obs,node) + 1) > num_levels.list_data[node]) {
446 num_levels.list_data[node] = num_levels.list_data[node] + 1;
447 }
448 }
449 } else {
450 // continuous node defined to have no levels
451 num_levels.list_data[node] = 0;
452 }
453 }
454 #endif
455 } else {
456 _String errorMsg ("Number of variables in data matrix do not match number of nodes in graph.");
457 WarnError (errorMsg); // note, this produces a crash because the batch file proceeds to execute
458 // BGM routines without data. - AFYP
459 }
460
461
462 last_node_order.Clear(); // forget the last step taken in MCMC chain
463 best_node_order.Clear();
464
465 CacheNodeScores();
466
467 // re-allocate memory to lists
468 // InitComputeLists ();
469 }
470
471
472
473 //___________________________________________________________________________________________
SetWeightMatrix(_Matrix * weights)474 void Bgm::SetWeightMatrix (_Matrix * weights)
475 {
476 if ( obsData
477 && (long) (obsData->GetHDim()) == (long) (weights->GetHDim())
478 /*&& (long) (obsData->GetVDim()) == (long) (weights->GetVDim())*/ ) {
479 obsWeights = weights;
480 ReportWarning( _String("Set weight matrix to ") & (_String *) obsWeights->toStr() );
481 } else {
482 _String errorMsg ("Number of weights does not match number of observations in current data set.");
483 WarnError (errorMsg);
484 }
485
486 scores_cached = FALSE;
487 }
488
489
490
491
492 //___________________________________________________________________________________________
SetGraphMatrix(_Matrix * graph)493 void Bgm::SetGraphMatrix (_Matrix *graph)
494 {
495 /*
496 char bug [255];
497
498 snprintf (bug, sizeof(bug), "Entered Bgm::SetGraphMatrix()\n");
499 BufferToConsole (bug);
500 */
501 dag = (_Matrix &) (*graph); // matrix assignment
502 ReportWarning (_String("set graph matrix to:\n") & (_String *) dag.toStr() & "\n" );
503 }
504
505
SetBanMatrix(_Matrix * banMx)506 void Bgm::SetBanMatrix (_Matrix *banMx)
507 {
508 banned_edges = (_Matrix &) (*banMx);
509
510 ReportWarning (_String("Set ban matrix to ") & (_String *) banned_edges.toStr() & "\n" );
511 }
512
SetEnforceMatrix(_Matrix * enforceMx)513 void Bgm::SetEnforceMatrix (_Matrix *enforceMx)
514 {
515 enforced_edges = (_Matrix &) (*enforceMx);
516 ReportWarning (_String("Set enforce matrix to ") & (_String *) enforced_edges.toStr() & "\n" );
517 }
518
SetBestOrder(_SimpleList * orderList)519 void Bgm::SetBestOrder (_SimpleList * orderList)
520 {
521 best_node_order.Populate(num_nodes, 0, 0);
522
523 for (long i = 0; i < num_nodes; i++) {
524 best_node_order.list_data[i] = orderList->list_data[i];
525 }
526 }
527
528 //___________________________________________________________________________________________
529 // For debugging..
PrintGraph(_Matrix * g)530 void Bgm::PrintGraph (_Matrix * g)
531 {
532 char buf [255];
533
534 if (g) {
535 for (long row = 0; row < g->GetHDim(); row++) {
536 for (long col = 0; col < g->GetVDim(); col++) {
537 snprintf (buf, sizeof(buf), "%ld ", (long) (*g)(row,col));
538 BufferToConsole (buf);
539 }
540 snprintf (buf, sizeof(buf), "\n");
541 BufferToConsole (buf);
542 }
543 } else {
544 for (long row = 0; row < dag.GetHDim(); row++) {
545 for (long col = 0; col < dag.GetVDim(); col++) {
546 snprintf (buf, sizeof(buf), "%ld ", (long)dag(row,col));
547 BufferToConsole (buf);
548 }
549 snprintf (buf, sizeof(buf), "\n");
550 BufferToConsole (buf);
551 }
552 }
553 NLToConsole();
554 }
555
556
557 //___________________________________________________________________________________________
ResetGraph(_Matrix * g)558 void Bgm::ResetGraph (_Matrix * g)
559 {
560 if (g) {
561 for (long row = 0; row < num_nodes; row++) {
562 for (long col = 0; col < num_nodes; col++) {
563 g->Store (row, col, dag (row, col));
564 }
565 }
566 ReportWarning (_String("Reset graph to dag argument\n"));
567 } else {
568 for (long row = 0; row < num_nodes; row++) {
569 for (long col = 0; col < num_nodes; col++) {
570 // enforced edges must always appear in DAG
571 // if none specified, all entries are zeroed
572 dag.Store (row, col, enforced_edges (row, col));
573 }
574 }
575 }
576 }
577
578
579
580 //___________________________________________________________________________________________
MarginalSum(bool over_rows,long offset)581 long Bgm::MarginalSum (bool over_rows, long offset)
582 {
583 long res = 0;
584
585 if (over_rows)
586 for (long i = 0; i < num_nodes; i++) {
587 res += dag(offset,i); // sum of n-th row
588 }
589 else
590 for (long i = 0; i < num_nodes; i++) {
591 res += dag(i,offset); // sum of n-th column
592 }
593
594 return res;
595 }
596
597
598
599 //___________________________________________________________________________________________
600 // DEPRECATED
IsCyclic(void)601 bool Bgm::IsCyclic (void)
602 {
603 _Matrix nodes_left (num_nodes, 1, false, true);
604
605 for (long i = 0; i < num_nodes; i++) { // populate vector with 1's
606 nodes_left.Store (i,0,1.);
607 }
608
609 long one_count = num_nodes,
610 last_count,
611 num_children;
612
613 do { // until no more leaves are removed
614 last_count = one_count;
615 one_count = 0;
616
617 // find and remove all nodes with no children (i.e. leaf nodes)
618 for (long parent = 0; parent < num_nodes; parent++) {
619 if (nodes_left(parent,0) == 1.) {
620 one_count++;
621
622 num_children = 0;
623 // count all existing nodes that have this parent
624 for (long child = 0; child < num_nodes; child++)
625 if (nodes_left(child,0) == 1. && child != parent)
626 if (dag(parent,child) == 1.) {
627 num_children++;
628 }
629
630 if (num_children == 0) { // count number of nodes with this node as parent
631 nodes_left.Store (parent, 0, 0.);
632 one_count--;
633 }
634 }
635 }
636 } while (one_count < last_count);
637
638
639 if (one_count > 0) {
640 return TRUE; // dag is cyclic
641 } else {
642 return FALSE; // removed all nodes as leaves, dag is acyclic
643 }
644 }
645
646
647
648
649 //___________________________________________________________________________________________
RandomizeGraph(_Matrix * graphMx,_SimpleList * order,long num_steps,bool fixed_order)650 void Bgm::RandomizeGraph (_Matrix * graphMx, _SimpleList * order, long num_steps, bool fixed_order)
651 {
652 long step = 0, fail = 0;
653
654
655 // convert order into matrix format of edge permissions for more rapid look-up later /* DEBUGGING ONLY */
656 #ifdef __DEBUG_RG__
657 long mode = 0;
658 _Matrix orderMx (num_nodes, num_nodes, false, true);
659
660 for (long p_rank = 0; p_rank < num_nodes; p_rank++) {
661 for (long c_rank = 0; c_rank < num_nodes; c_rank++) {
662 // use actual node id's to index into matrix - 1 indicates a permitted edge
663 orderMx.Store (order->list_data[p_rank], order->list_data[c_rank], p_rank > c_rank ? 1 : 0);
664 }
665 }
666 #endif
667
668
669 // before we do anything, make sure that graph complies with order /* DEBUGGING */
670 // calculate number of parents for each child for rapid access later
671 _SimpleList num_parents;
672
673 num_parents.Populate (num_nodes, 0, 0);
674
675 for (long child = 0; child < num_nodes; child++) {
676 for (long parent = 0; parent < num_nodes; parent++) {
677 #ifdef __DEBUG_RG__
678 if ((*graphMx)(parent,child) > 0 && banned_edges(parent,child) > 0) {
679 WarnError (_String("BEFORE RandomizeGraph() there is a banned edge: ") & parent & "->" & child & " mode " & mode);
680 break;
681 }
682 if ((*graphMx)(parent,child) == 0 && enforced_edges(parent,child) > 0) {
683 WarnError (_String("BEFORE RandomizeGraph() there is a missing enforced edge: ") & parent & "->" & child);
684 break;
685 }
686
687 if ( (*graphMx)(parent, child)==1 && orderMx(parent, child)==0 ) {
688 WarnError (_String ("Order-network discrepancy at start of RandomizeGraph() at edge ") & parent & "->" & child);
689 break;
690 }
691 #endif
692
693 if ( (*graphMx)(parent, child) > 0) {
694 num_parents.list_data[child]++;
695 }
696 }
697 // end for
698
699 if (num_parents.list_data[child] > max_parents.list_data[child]) {
700 WarnError (_String ("Number of parents exceeds maximum BEFORE randomization of graph at node ") & child & " (" & num_parents.list_data[child] & " > " & max_parents.list_data[child] & ")\n" );
701 break;
702 }
703 }
704
705
706
707
708 // randomize graph
709 long p_rank, c_rank, parent, child;
710
711 do {
712 // a fail-safe to avoid infinite loops
713 if (fail > MAX_FAIL_RANDOMIZE) {
714 WarnError(_String("Bgm::RandomizeGraph() failed to modify the graph in GraphMCMC() after MAX_FAIL_RANDOMIZE (") & (long)MAX_FAIL_RANDOMIZE & ") attempts.\n");
715 break;
716 }
717
718 // pick a random edge permitted under ordering
719 p_rank = (genrand_int32() % (num_nodes-1)) + 1; // shift right to not include lowest node in order
720 c_rank = genrand_int32() % p_rank;
721
722 child = order->list_data[c_rank];
723 parent = order->list_data[p_rank];
724
725
726 if (fixed_order || genrand_real2() > RANDOMIZE_PROB_SWAP) { // attempt an add or delete
727 if ( (*graphMx)(parent,child) == 0 && !banned_edges(parent,child)) { // add an edge
728 #ifdef __DEBUG_RG__
729 mode = 0;
730 #endif
731 if (num_parents.list_data[child] == max_parents.list_data[child]) { // child cannot accept any additional edges
732 // move an edge from an existing parent to target parent
733 _SimpleList removeable_edges;
734
735 // build a list of current parents
736 for (long par = 0; par < num_nodes; par++)
737 if ((*graphMx)(par,child) && !enforced_edges(par,child)) {
738 removeable_edges << par;
739 }
740
741 if (removeable_edges.lLength > 0) {
742 // shuffle the list and remove the first parent
743 removeable_edges.Permute(1);
744 graphMx->Store (removeable_edges.list_data[0], child, 0.);
745 graphMx->Store (parent, child, 1.);
746 step++;
747 } else {
748 // none of the edges can be removed
749 fail++;
750 }
751 } else {
752 // child can accept another edge
753 graphMx->Store (parent, child, 1.);
754 num_parents.list_data[child]++;
755 step++;
756 }
757 } else if ( (*graphMx)(parent,child) == 1 && !enforced_edges(parent,child)) { // delete an edge
758 #ifdef __DEBUG_RG__
759 mode = 1;
760 #endif
761 graphMx->Store (parent, child, 0.);
762 num_parents.list_data[child]--;
763 step++;
764 } else {
765 fail++;
766 }
767 } else { // swap nodes in ordering and flip edge if present
768 long ok_to_go = 1;
769 #ifdef __DEBUG_RG__
770 mode = 2;
771 #endif
772 // edge cannot be flipped
773 if ( (*graphMx)(parent,child) == 1 && ( banned_edges(child,parent) || enforced_edges(parent,child) ) ) {
774 ok_to_go = 0;
775 }
776
777
778 // check all other nodes affected by the swap
779 if (ok_to_go) {
780 for (long bystander, i=c_rank+1; i < p_rank; i++) {
781 bystander = order->list_data[i]; // retrieve node id
782
783 if (
784 ( (*graphMx)(parent,bystander)==1 && enforced_edges (parent, bystander) ) ||
785 ( (*graphMx)(bystander,child)==1 && enforced_edges (bystander, child) ) ||
786 banned_edges (bystander,parent) || banned_edges (child,bystander)
787 ) {
788 // by flipping the parent->child edge, we would screw up ordering for
789 // an enforced edge: C < B < P becomes P < B < C where B->C or P->B is enforced
790
791 // also, a banned edge implies a node ordering constraint
792 ok_to_go = 0;
793 break;
794 }
795 }
796 }
797
798
799 // if everything checks out OK
800 if ( ok_to_go ) {
801 // flip the target edge
802 if ( (*graphMx)(parent,child) == 1 && !banned_edges(child,parent) ) {
803 graphMx->Store (parent, child, 0);
804 graphMx->Store (child, parent, 1);
805 num_parents.list_data[child]--;
806 num_parents.list_data[parent]++;
807
808 if (num_parents.list_data[parent] > max_parents.list_data[parent]) {
809 // parent cannot accept any more edges, delete one of the edges at random (including the edge to flip)
810 _SimpleList removeable_edges;
811
812 for (long par = 0; par < num_nodes; par++)
813 if ( (*graphMx)(par, parent) && !enforced_edges(par, parent) ) {
814 removeable_edges << par;
815 }
816
817 removeable_edges.Permute(1);
818 graphMx->Store (removeable_edges.list_data[0], parent, 0.);
819 num_parents.list_data[parent]--;
820 }
821
822 step++;
823 }
824 // if number of parents for parent node will exceed maximum, then the edge is deleted instead of flipped
825
826 // swap nodes in order
827 order->list_data[p_rank] = child;
828 order->list_data[c_rank] = parent; // remember to update order matrix also!
829
830
831 // flip the other edges affected by node swap
832 // child <- N ... N <- parent _________________,
833 // becomes | v
834 // parent N N child
835 // ^ |
836 // `----------------+
837 for (long bystander, i = c_rank+1; i < p_rank; i++) {
838 bystander = order->list_data[i];
839
840 if ( (*graphMx)(bystander, child) == 1 ) {
841 graphMx->Store (bystander, child, 0);
842 num_parents.list_data[child]--;
843
844 graphMx->Store (child, bystander, 1);
845 num_parents.list_data[bystander]++;
846
847 if (num_parents.list_data[bystander] > max_parents.list_data[bystander]) {
848 _SimpleList removeable_edges;
849
850 for (long par = 0; par < num_nodes; par++)
851 if ( (*graphMx)(par, bystander) && !enforced_edges(par, bystander) ) {
852 removeable_edges << par;
853 }
854
855 removeable_edges.Permute(1);
856 graphMx->Store (removeable_edges.list_data[0], bystander, 0.);
857 num_parents.list_data[bystander]--;
858 }
859 }
860
861 if ( (*graphMx)(parent,bystander) == 1) {
862 graphMx->Store (parent, bystander, 0);
863 num_parents.list_data[bystander]--;
864
865 graphMx->Store (bystander, parent, 1);
866 num_parents.list_data[parent]++;
867
868 if (num_parents.list_data[parent] > max_parents.list_data[parent]) {
869 _SimpleList removeable_edges;
870
871 for (long par = 0; par < num_nodes; par++)
872 if ( (*graphMx)(par, parent) && !enforced_edges(par, parent) ) {
873 removeable_edges << par;
874 }
875
876 removeable_edges.Permute(1);
877 graphMx->Store (removeable_edges.list_data[0], parent, 0.);
878 num_parents.list_data[parent]--;
879 }
880 }
881 }
882 #ifdef __DEBUG_RG__
883 // refresh order matrix
884 for (long p_rank = 0; p_rank < num_nodes; p_rank++) {
885 for (long c_rank = 0; c_rank < p_rank; c_rank++) {
886 orderMx.Store (order->list_data[p_rank], order->list_data[c_rank], 1);
887 }
888 }
889 #endif
890 step++;
891 } else {
892 fail++;
893 }
894 }
895 } while (step < num_steps);
896
897
898 // a final check to make sure we haven't screwed up!
899 #ifdef __DEBUG_RG__
900 num_parents.Populate (num_nodes, 0, 0);
901
902 for (long child = 0; child < num_nodes; child++) {
903 for (long parent = 0; parent < num_nodes; parent++) {
904 if ((*graphMx)(parent,child) > 0 && banned_edges(parent,child) > 0) {
905 WarnError (_String("Aw crap! RandomizeGraph() introduced a banned edge: ") & parent & "->" & child & " mode " & mode);
906 break;
907 }
908 if ((*graphMx)(parent,child) == 0 && enforced_edges(parent,child) > 0) {
909 WarnError (_String("Aw crap! RandomizeGraph() deleted an enforced edge: ") & parent & "->" & child);
910 break;
911 }
912 if ( (*graphMx)(parent, child)==1 && orderMx(parent, child)==0 ) {
913 WarnError (_String ("Order-network discrepancy at end of RandomizeGraph(), mode ") & mode);
914 break;
915 }
916
917 if ( (*graphMx)(parent, child) > 0) {
918 num_parents.list_data[child]++;
919 }
920 }
921
922 if (num_parents.list_data[child] > max_parents.list_data[child]) {
923 WarnError (_String ("Exceeded acceptable number of parents for node ") & child & ", mode " & mode);
924 break;
925 }
926 }
927 #endif
928 }
929
930
931
932 //___________________________________________________________________________________________
RandomizeDag(long num_steps)933 void Bgm::RandomizeDag (long num_steps)
934 {
935
936 for (long step = 0; step < num_steps; step++) {
937 // store current graph in case altered graph becomes cyclic
938 _Matrix reset_dag (num_nodes, num_nodes, false, true);
939 for (long h = 0; h < num_nodes; h++)
940 for (long v = 0; v < num_nodes; v++) {
941 reset_dag.Store (h,v,dag(h,v));
942 }
943
944 do {
945 for (long row = 0; row < num_nodes; row++) {
946 for (long col = 0; col < num_nodes; col++) {
947 dag.Store (row,col,reset_dag(row,col));
948 }
949 }
950
951
952 // select a random arc, discarding cycles and banned edges
953 long row = 0,
954 col = row;
955
956 while (col == row && banned_edges(row,col) == 1) {
957 row = genrand_int32() % num_nodes;
958 col = genrand_int32() % num_nodes;
959 }
960
961 // printf ("RandomizeDag() picked %d,%d\n", row, col);
962
963 if (dag(row,col) == 0.) {
964 if (dag(col,row) == 0.) { // no arc
965 // add an arc -- sum over n-th column gives number of parents for n-th child
966 if (MarginalSum(FALSE,col) < max_parents.list_data[col]) {
967 dag.Store (row,col,1.);
968 }
969 } else {
970 // reverse or remove arc
971 dag.Store (col,row,0.);
972 if (genrand_int32() % 2 == 0) {
973 dag.Store (row,col,1.);
974 }
975 }
976 } else {
977 if (dag(col,row) == 0.) {
978 // reverse or remove
979 dag.Store (row,col,0.);
980 if (genrand_int32() % 2 == 0) {
981 dag.Store (col,row,1.);
982 }
983 } else {
984 // double arc, set to one of three legal arcs
985 long option = genrand_int32() % 3;
986 if (option == 0) {
987 dag.Store (row,col,0.);
988 } else if (option == 1) {
989 dag.Store (col,row,0.);
990 } else {
991 dag.Store (row,col,0.);
992 dag.Store (col,row,0.);
993 }
994 }
995 }
996
997 // PrintGraph(nil);
998
999 } while ( IsCyclic() );
1000 }
1001 // end loop over steps
1002 }
1003
1004
1005
1006
1007 //___________________________________________________________________________________________
1008 // Wrappers to retain original functionality.
ComputeDiscreteScore(long node_id)1009 hyFloat Bgm::ComputeDiscreteScore (long node_id)
1010 {
1011 _SimpleList parents;
1012
1013 for (long par = 0; par < num_nodes; par++) {
1014 if (dag(par, node_id) == 1 && is_discrete.list_data[par]) {
1015 parents << par;
1016 }
1017 }
1018
1019 return ComputeDiscreteScore (node_id, parents);
1020 }
1021
ComputeDiscreteScore(long node_id,_Matrix * g)1022 hyFloat Bgm::ComputeDiscreteScore (long node_id, _Matrix * g)
1023 {
1024 _SimpleList parents;
1025
1026 for (long par = 0; par < num_nodes; par++) {
1027 if ((*g)(par, node_id) == 1 && is_discrete.list_data[par]) {
1028 parents << par;
1029 }
1030 }
1031
1032 return ComputeDiscreteScore (node_id, parents);
1033 }
1034
1035
1036 //___________________________________________________________________________________________
1037 //#define BGM_DEBUG_CDS
ComputeDiscreteScore(long node_id,_SimpleList & parents)1038 hyFloat Bgm::ComputeDiscreteScore (long node_id, _SimpleList & parents)
1039 {
1040 //char buf [255];
1041
1042 // use cached node scores if possible
1043 if (scores_cached) {
1044 _List * scores = (_List *) node_scores.list_data[node_id];
1045
1046 if (parents.lLength == 0) {
1047 _Constant * orphan_score = (_Constant *) scores->list_data[0];
1048 return (hyFloat) orphan_score->Value();
1049 } else if (parents.lLength == 1) {
1050 _Matrix * single_parent_scores = (_Matrix *) scores->list_data[1];
1051 return (hyFloat) (*single_parent_scores) (parents.list_data[0], 0);
1052 } else {
1053 _NTupleStorage * family_scores = (_NTupleStorage *) scores->list_data[parents.lLength];
1054 _SimpleList nktuple;
1055
1056 for (long i = 0; i < parents.lLength; i++) { // map parents back to nk-tuple
1057 long par = parents.list_data[i];
1058 if (par > node_id) {
1059 par--;
1060 }
1061 nktuple << par;
1062 }
1063 return (hyFloat) family_scores->Retrieve (nktuple); // using nk-tuple
1064 }
1065 }
1066
1067
1068
1069 #ifdef __MISSING_DATA__
1070 // Are any of the edges banned?
1071 for (long par = 0; par < parents.lLength; par++) {
1072 if (banned_edges(parents.list_data[par], node_id) > 0) {
1073 // score should never be used
1074 ReportWarning(_String("Skipping node score for family containing banned edge ") & parents.list_data[par] & "->" & node_id & "\n");
1075 return (-INFINITY);
1076 }
1077 }
1078
1079
1080 // Is node with missing data in Markov blanket of focal node?
1081 if (has_missing.list_data[node_id]) {
1082 //return (ImputeDiscreteScore (node_id, parents));
1083 return (GibbsApproximateDiscreteScore (node_id, parents));
1084 } else {
1085 for (long par = 0; par < parents.lLength; par++) {
1086 if (has_missing.list_data[parents.list_data[par]]) {
1087 // return (ImputeDiscreteScore (node_id, parents));
1088 return (GibbsApproximateDiscreteScore (node_id, parents));
1089 }
1090 }
1091 }
1092 #endif
1093
1094
1095 _SimpleList multipliers ((long)1);
1096
1097 // else need to compute de novo
1098 _Matrix n_ijk,
1099 n_ij; // [i] indexes child nodes,
1100 // [j] indexes combinations of values for parents of i-th node,
1101 // [k] indexes values of i-th node
1102
1103 long num_parent_combos = 1, // i.e. 'q'
1104 r_i = num_levels.list_data[node_id];
1105
1106 hyFloat n_prior_ijk = 0,
1107 n_prior_ij = 0,
1108 log_score = 0;
1109
1110
1111
1112 // how many combinations of parental states are there?
1113 for (long par = 0; par < parents.lLength; par++) {
1114 num_parent_combos *= num_levels.list_data[parents.list_data[par]];
1115 multipliers << num_parent_combos;
1116 }
1117
1118
1119 #ifdef BGM_DEBUG_CDS
1120 snprintf (buf, sizeof(buf), "Multipliers: ");
1121 BufferToConsole (buf);
1122 for (long i = 0; i < multipliers.lLength; i++) {
1123 snprintf (buf, sizeof(buf), "%d ", multipliers.list_data[i]);
1124 BufferToConsole (buf);
1125 }
1126 NLToConsole();
1127 #endif
1128
1129
1130 /* count observations by parent combination, using direct indexing */
1131 CreateMatrix (&n_ijk, num_parent_combos, r_i, false, true, false);
1132 CreateMatrix (&n_ij, num_parent_combos, 1, false, true, false);
1133
1134
1135 #ifdef __MISSING_DATA__
1136 /* METHODS FOR COMPLETE DATA */
1137 for (long obs = 0; obs < obsData->GetHDim(); obs++) {
1138 long index = 0,
1139 //multiplier = 1,
1140 child_state = (*obsData)(obs, node_id);
1141
1142 for (long par = 0; par < parents.lLength; par++) {
1143 long this_parent = parents.list_data[par],
1144 this_parent_state = (*obsData)(obs, this_parent);
1145
1146 index += this_parent_state * multipliers.list_data[par];
1147 }
1148
1149 n_ijk.Store ((long) index, child_state, n_ijk(index, child_state) + 1 );
1150 n_ij.Store ((long) index, 0, n_ij(index, 0) + 1 );
1151 }
1152
1153
1154 #ifdef BGM_DEBUG_CDS
1155 snprintf (buf, sizeof(buf), "Node %d, parent(s) ", node_id);
1156 BufferToConsole (buf);
1157
1158 for (long k = 0; k < parents.lLength; k++) {
1159 snprintf (buf, sizeof(buf), "%d ", parents.list_data[k]);
1160 BufferToConsole (buf);
1161 }
1162 NLToConsole();
1163
1164 for (long j = 0; j < num_parent_combos; j++) {
1165 snprintf (buf, sizeof(buf), "N(%d,%d) = %f\n", node_id, j, n_ij(j,0));
1166 BufferToConsole (buf);
1167
1168 for (long k = 0; k < r_i; k++) {
1169 snprintf (buf, sizeof(buf), "N(%d,%d,%d) = %f\n", node_id, j, k, n_ijk(j,k));
1170 BufferToConsole (buf);
1171 }
1172 }
1173 #endif
1174
1175
1176 if (prior_sample_size (node_id, 0) == 0) { // K2
1177 for (long j = 0; j < num_parent_combos; j++) {
1178 log_score += LnGamma(num_levels.list_data[node_id]); // (r-1)!
1179 log_score -= LnGamma(n_ij(j, 0) + num_levels.list_data[node_id]); // (N+r-1)!
1180
1181 for (long k = 0; k < r_i; k++) {
1182 log_score += LnGamma (n_ijk(j,k) + 1); // (N_ijk)!
1183 }
1184
1185 #ifdef BGM_DEBUG_CDS
1186 snprintf (buf, sizeof(buf), "\tlog(r-1)! = log %d! = %lf\n", num_levels.list_data[node_id] - 1, LnGamma(num_levels.list_data[node_id]));
1187 BufferToConsole (buf);
1188
1189 snprintf (buf, sizeof(buf), "\tlog(N(%d,%d)+r-1)! = log %d! = %lf\n", node_id, j, ((long)n_ij(j, 0)) + r_i - 1,
1190 LnGamma(n_ij(j, 0) + num_levels.list_data[node_id]));
1191 BufferToConsole (buf);
1192
1193 for (long k = 0; k < r_i; k++) {
1194 snprintf (buf, sizeof(buf), "\tlog (N(%d,%d,%d)!) = log %d! = %lf\n", node_id, j, k, ((long)n_ijk(j,k)), LnGamma(n_ijk(j,k) + 1));
1195 BufferToConsole (buf);
1196 }
1197
1198 snprintf (buf, sizeof(buf), "j = %d\tcumulative log score = %lf\n", j, log_score);
1199 BufferToConsole (buf);
1200 #endif
1201 }
1202 } else { // BDe
1203 n_prior_ij = prior_sample_size (node_id, 0) / num_parent_combos;
1204 n_prior_ijk = n_prior_ij / num_levels.list_data[node_id];
1205
1206 for (long j = 0; j < num_parent_combos; j++) {
1207 log_score += LnGamma(n_prior_ij) - LnGamma(n_prior_ij + n_ij(j,0));
1208
1209 for (long k = 0; k < num_levels.list_data[node_id]; k++) {
1210 log_score += LnGamma(n_prior_ijk + n_ijk(j,k)) - LnGamma(n_prior_ijk);
1211 }
1212 }
1213 }
1214
1215 #else
1216 for (long obs = 0; obs < obsData->GetHDim(); obs++) {
1217 long index = 0,
1218 //multiplier = 1,
1219 child_state = (*obsData)(obs, node_id);
1220
1221 for (long par = 0; par < parents.lLength; par++) {
1222 long this_parent = parents.list_data[par],
1223 this_parent_state = (*obsData)(obs, this_parent);
1224
1225
1226 /* // this system doesn't work with unequal levels! :-P afyp March 31, 2008
1227 index += this_parent_state * multiplier;
1228 multiplier *= num_levels.list_data[this_parent];
1229 */
1230 index += this_parent_state * multipliers.list_data[par];
1231 }
1232
1233 n_ijk.Store ((long) index, child_state, n_ijk(index, child_state) + 1);
1234 n_ij.Store ((long) index, 0, n_ij(index, 0) + 1);
1235
1236 }
1237
1238
1239
1240 /* compute scoring metric */
1241 if (prior_sample_size (node_id, 0) == 0) {
1242 /* assume no prior information, use K2 metric */
1243 for (long j = 0; j < num_parent_combos; j++) {
1244 log_score += LnGamma(r_i); // (r-1)!
1245 log_score -= LnGamma(n_ij(j, 0) + r_i); // (N+r-1)!
1246
1247 for (long k = 0; k < num_levels.list_data[node_id]; k++) {
1248 log_score += LnGamma(n_ijk(j,k) + 1); // (N_ijk)!
1249 }
1250
1251
1252
1253 snprintf (buf, sizeof(buf), "Node %d, parent(s) ", node_id);
1254 BufferToConsole (buf);
1255
1256 for (long k = 0; k < parents.lLength; k++) {
1257 snprintf (buf, sizeof(buf), "%d ", parents.list_data[k]);
1258 BufferToConsole (buf);
1259 }
1260 NLToConsole();
1261
1262 snprintf (buf, sizeof(buf), "j = %d\tcumulative log score = %lf\n", j, log_score);
1263 BufferToConsole (buf);
1264
1265 #ifdef BGM_DEBUG_CDS
1266 snprintf (buf, sizeof(buf), "\tlog(r-1)! = log %d! = %lf\n", num_levels.list_data[node_id] - 1, LnGamma(num_levels.list_data[node_id]));
1267 BufferToConsole (buf);
1268
1269 snprintf (buf, sizeof(buf), "\tlog(N+r-1)! = log %d! = %lf\n", n_ij(j, 0) + num_levels.list_data[node_id] - 1,
1270 LnGamma(n_ij(j, 0) + num_levels.list_data[node_id]));
1271 BufferToConsole (buf);
1272
1273 for (long k = 0; k < num_levels.list_data[node_id]; k++) {
1274 snprintf (buf, sizeof(buf), "\tlog (N_ijk)! = log %d! = %lf\n", ((long)n_ijk(j,k)), LnGamma(n_ijk(j,k) + 1));
1275 BufferToConsole (buf);
1276 }
1277
1278
1279 #endif
1280 }
1281 } else {
1282 /* calculate Bayesian Dirichlet metric (BDeu) for this node */
1283 /* see p212 in Heckerman, Geiger, and Chickering (1995) Machine Learning 20, 197-243 */
1284 n_prior_ij = prior_sample_size (node_id, 0) / num_parent_combos;
1285 n_prior_ijk = n_prior_ij / num_levels.list_data[node_id];
1286
1287 for (long j = 0; j < num_parent_combos; j++) {
1288 log_score += LnGamma(n_prior_ij) - LnGamma(n_prior_ij + n_ij(j,0));
1289
1290 for (long k = 0; k < num_levels.list_data[node_id]; k++) {
1291 log_score += LnGamma(n_prior_ijk + n_ijk(j,k)) - LnGamma(n_prior_ijk);
1292 }
1293 }
1294 }
1295 #endif
1296
1297 /*
1298 snprintf (buf, sizeof(buf), "Node %d, parents (%d): ", node_id, parents.lLength);
1299 BufferToConsole (buf);
1300
1301 for (long par = 0; par < parents.lLength; par++)
1302 {
1303 snprintf (buf, sizeof(buf), " %d", parents.list_data[par]);
1304 BufferToConsole (buf);
1305 }
1306 snprintf (buf, sizeof(buf), " Log score = %f\n", log_score);
1307 BufferToConsole (buf);
1308 */
1309
1310 return (log_score);
1311 }
1312
1313
1314
1315
1316 //___________________________________________________________________________________________
1317 //#define __DEBUG_CNS__
CacheNodeScores(void)1318 void Bgm::CacheNodeScores (void)
1319 {
1320 if (scores_cached) {
1321 return;
1322 }
1323
1324 /*
1325 char buf [255];
1326 snprintf (buf, sizeof(buf), "\nCaching node scores...\n");
1327 BufferToConsole (buf);
1328 */
1329
1330
1331 #if defined __HYPHYMPI_ND__
1332 hyFloat use_mpi_caching;
1333 checkParameter (useMPIcaching, use_mpi_caching, 0);
1334
1335 if (use_mpi_caching) {
1336
1337 // MPI_Init() is called in main()
1338 int size,
1339 rank;
1340
1341 long mpi_node;
1342
1343 _Matrix single_parent_scores (num_nodes, 1, false, true);
1344
1345 _SimpleList parents,
1346 all_but_one (num_nodes-1, 0, 1),
1347 aux_list,
1348 nk_tuple;
1349
1350 hyFloat score;
1351
1352 char mpi_message [256];
1353
1354 MPI_Status status; // contains source, tag, and error code
1355
1356 MPI_Comm_size (MPI_COMM_WORLD, &size); // number of processes
1357 MPI_Comm_rank (MPI_COMM_WORLD, &rank);
1358
1359
1360 if (rank == 0) {
1361 _String bgmSwitch ("_BGM_SWITCH_"),
1362 bgmStr;
1363
1364 _List * this_list;
1365
1366 SerializeBgm (bgmStr);
1367 #ifdef __DEBUG_CNS__
1368 ReportWarning (bgmStr);
1369 #endif
1370 _Matrix * mpi_node_status = new _Matrix ((long)size, 2, false, true);
1371 long senderID;
1372
1373
1374
1375 // switch compute nodes from mpiNormal to bgm loop context
1376 for (long ni = 1; ni < size; ni++) {
1377 MPISendString (bgmSwitch, ni);
1378 }
1379
1380
1381
1382 // receive confirmation of successful switch
1383 for (long ni = 1; ni < size; ni++) {
1384 long fromNode = ni;
1385
1386 _String t (MPIRecvString (ni,fromNode));
1387 if (!t.Equal (&bgmSwitch)) {
1388 WarnError (_String("Failed to confirm MPI mode switch at node ") & ni);
1389 return;
1390 } else {
1391 ReportWarning (_String("Successful mode switch to Bgm MPI confirmed from node ") & ni);
1392 MPISendString (bgmStr, ni);
1393 }
1394 }
1395
1396
1397 long node_id;
1398
1399 // farm out jobs to idle nodes until none are left
1400 for ( node_id = 0; node_id < num_nodes; node_id++) {
1401 long maxp = max_parents.list_data[node_id],
1402 ntuple_receipt,
1403 this_node;
1404
1405 bool remaining;
1406
1407 _String mxString,
1408 mxName;
1409
1410
1411 this_list = (_List *) node_scores.list_data[node_id];
1412
1413 // [_SimpleList parents] should always be empty here
1414 hyFloat score = is_discrete.list_data[node_id] ? ComputeDiscreteScore (node_id, parents) : ComputeContinuousScore (node_id);
1415 _Constant orphan_score (score);
1416
1417
1418 this_list->Clear();
1419 (*this_list) && (&orphan_score); // handle orphan score locally
1420
1421
1422 if (maxp > 0) { // don't bother to farm out trivial cases
1423 // look for idle nodes
1424 mpi_node = 1;
1425 do {
1426 if ((*mpi_node_status)(mpi_node, 0) == 0) {
1427 ReportMPIError(MPI_Send(&node_id, 1, MPI_LONG, mpi_node, HYPHY_MPI_VARS_TAG, MPI_COMM_WORLD), true);
1428 ReportWarning (_String ("Sent child ") & node_id & " to node " & mpi_node);
1429
1430 mpi_node_status->Store (mpi_node, 0, 1); // set busy signal
1431 mpi_node_status->Store (mpi_node, 1, node_id);
1432
1433 break;
1434 }
1435 mpi_node++;
1436 } while (mpi_node < size);
1437 }
1438
1439
1440 if (mpi_node == size) { // all nodes are busy, wait for one to send results
1441 MPIReceiveScores (mpi_node_status, true, node_id);
1442 }
1443 }
1444 // end loop over child nodes
1445
1446
1447 // collect remaining jobs
1448 while (1) {
1449 // look for a busy node
1450 mpi_node = 1;
1451 do {
1452 if ( (*mpi_node_status)(mpi_node,0) == 1) {
1453 break;
1454 }
1455 mpi_node++;
1456 } while (mpi_node < size);
1457
1458
1459 if (mpi_node < size) { // at least one node is busy
1460 MPIReceiveScores (mpi_node_status, false, 0);
1461 } else {
1462 break;
1463 }
1464 }
1465
1466
1467 // shut down compute nodes
1468 for (long shutdown = -1, mpi_node = 1; mpi_node < size; mpi_node++) {
1469 ReportMPIError(MPI_Send(&shutdown, 1, MPI_LONG, mpi_node, HYPHY_MPI_VARS_TAG, MPI_COMM_WORLD), true);
1470 ReportWarning (_String ("Node 0 sending shutdown signal to node ") & mpi_node);
1471 }
1472
1473 } else { // compute node
1474 long node_id,
1475 maxp;
1476
1477 _List list_of_matrices;
1478
1479 while (1) {
1480 _String mxString,
1481 mxName;
1482
1483 list_of_matrices.Clear();
1484
1485
1486 // wait for master node to issue node ID to compute
1487 ReportMPIError (MPI_Recv (&node_id, 1, MPI_LONG, 0, HYPHY_MPI_VARS_TAG, MPI_COMM_WORLD, &status), false);
1488 ReportWarning (_String("Node ") & (long)rank & " received child " & (long)node_id & " from node " & (long)status.MPI_SOURCE & "\n");
1489
1490 if (node_id < 0) {
1491 ReportWarning (_String("Node") & (long)rank & " recognizes shutdown signal.\n");
1492 break; // received shutdown message (-1)
1493 }
1494
1495
1496 maxp = max_parents.list_data[node_id];
1497
1498 parents.Clear();
1499 parents.Populate (1,0,0);
1500
1501
1502 // compute single parent scores
1503 for (long par = 0; par < num_nodes; par++) {
1504 if (par == node_id) { // child cannot be its own parent, except in Kansas
1505 single_parent_scores.Store (par, 0, 0.);
1506 } else {
1507 parents.list_data[0] = par;
1508 single_parent_scores.Store (par, 0, is_discrete.list_data[node_id] ? ComputeDiscreteScore (node_id, parents) :
1509 ComputeContinuousScore (node_id));
1510 }
1511 }
1512
1513
1514 // compute multiple parents cores
1515 for (long np = 2; np <= maxp; np++) {
1516 parents.Clear();
1517 parents.Populate (np, 0, 0);
1518
1519 if (all_but_one.NChooseKInit (aux_list, nk_tuple, np, false)) {
1520 bool remaining;
1521 long tuple_index = 0,
1522 num_ktuples = exp(LnGamma(num_nodes) - LnGamma(num_nodes - np) - LnGamma(np+1));
1523
1524
1525 _Matrix tuple_scores (num_ktuples, 1, false, true);
1526
1527 for (long tuple_index = 0; tuple_index < num_ktuples; tuple_index++) {
1528 remaining = all_but_one.NChooseK (aux_list, nk_tuple);
1529
1530 if (!remaining && tuple_index < num_ktuples-1) {
1531 ReportWarning (_String ("ERROR: Ran out of (n,k)tuples in CacheNodeScores()."));
1532 }
1533
1534 for (long par_idx = 0; par_idx < np; par_idx++) {
1535 long par = nk_tuple.list_data[par_idx];
1536 if (par >= node_id) {
1537 par++;
1538 }
1539 parents.list_data[par_idx] = par;
1540 }
1541
1542 score = is_discrete.list_data[node_id] ? ComputeDiscreteScore (node_id, parents) :
1543 ComputeContinuousScore (node_id);
1544
1545 tuple_scores.Store (tuple_index, 0, (double)score);
1546 }
1547
1548 if (remaining) {
1549 ReportWarning (_String ("ERROR: Did not compute all nk-tuples in CacheNodeScores()"));
1550 }
1551
1552 list_of_matrices && (&tuple_scores); // append duplicate
1553 } else {
1554 ReportWarning (_String ("Failed to initialize _NTupleStorage object in Bgm::CacheNodeScores()"));
1555 }
1556
1557 }
1558
1559 // send results to master node
1560
1561 ReportMPIError (MPI_Send (single_parent_scores.theData, num_nodes, MPI_DOUBLE, 0, HYPHY_MPI_DATA_TAG, MPI_COMM_WORLD), true);
1562
1563
1564 for (long np = 2; np <= maxp; np++) {
1565 _Matrix * storedMx = (_Matrix *) list_of_matrices.list_data[np-2];
1566 ReportMPIError (MPI_Send (storedMx->theData, storedMx->GetHDim(), MPI_DOUBLE, 0, HYPHY_MPI_DATA_TAG, MPI_COMM_WORLD), true);
1567 }
1568 }
1569 }
1570 } else { // perform single-threaded
1571 #else
1572
1573
1574 #if !defined __UNIX__ || defined __HEADLESS__
1575 TimerDifferenceFunction(false); // save initial timer; will only update every 1 second
1576 #if !defined __HEADLESS__
1577 SetStatusLine (empty,_HYBgm_STATUS_LINE_CACHE, empty, 0, HY_SL_TASK|HY_SL_PERCENT);
1578 #else
1579 SetStatusLine (_HYBgm_STATUS_LINE_CACHE);
1580 #endif
1581 hyFloat seconds_accumulator = .0,
1582 temp;
1583 #endif
1584
1585 for (long node_id = 0; node_id < num_nodes; node_id++) {
1586 long maxp = max_parents.list_data[node_id];
1587 _List * this_list = (_List *) node_scores.list_data[node_id]; // retrieve pointer to list of scores for this child node
1588
1589 this_list->Clear(); // reset the list
1590
1591 // prepare some containers
1592 _SimpleList parents;
1593 hyFloat score = is_discrete.list_data[node_id] ? ComputeDiscreteScore (node_id, parents) : ComputeContinuousScore (node_id);
1594 _Constant orphan_score (score);
1595
1596 (*this_list) && (&orphan_score);
1597
1598 #if !defined __UNIX__ || defined __HEADLESS__
1599 temp = .0;
1600 #endif
1601
1602 if (maxp > 0) {
1603 _Matrix single_parent_scores (num_nodes, 1, false, true);
1604 for (long par = 0; par < num_nodes; par++) {
1605 if (par == node_id) { // child cannot be its own parent, except in Kansas
1606 continue;
1607 }
1608
1609 parents << par;
1610 single_parent_scores.Store (par, 0, is_discrete.list_data[node_id] ? ComputeDiscreteScore (node_id, parents) :
1611 ComputeContinuousScore (node_id));
1612 parents.Clear();
1613 }
1614 (*this_list) && (&single_parent_scores);
1615 }
1616
1617 if (maxp > 1) {
1618 _SimpleList all_but_one (num_nodes-1, 0, 1), // 0, 1, 2, ... , n-1
1619 aux_list,
1620 nk_tuple;
1621
1622 for (long np = 2; np <= maxp; np++) {
1623 _NTupleStorage family_scores (num_nodes-1, np);
1624
1625 if (all_but_one.NChooseKInit (aux_list, nk_tuple, np, false)) {
1626 bool remaining;
1627 long res;
1628 do {
1629 remaining = all_but_one.NChooseK (aux_list, nk_tuple);
1630 for (long par_idx = 0; par_idx < np; par_idx++) {
1631 long par = nk_tuple.list_data[par_idx];
1632 if (par >= node_id) {
1633 par++;
1634 }
1635 parents << par;
1636 }
1637 score = is_discrete.list_data[node_id] ? ComputeDiscreteScore (node_id, parents) :
1638 ComputeContinuousScore (node_id);
1639 res = family_scores.Store (score, nk_tuple);
1640 parents.Clear();
1641 } while (remaining);
1642 } else {
1643 _String oops ("Failed to initialize _NTupleStorage object in Bgm::CacheNodeScores().\n");
1644 WarnError(oops);
1645 }
1646 (*this_list) && (&family_scores); // append duplicate to storage
1647 }
1648 }
1649
1650 #if !defined __UNIX__ || defined __HEADLESS__
1651 if ((temp=TimerDifferenceFunction(true))>1.0) { // time to update
1652 seconds_accumulator += temp;
1653
1654 _String statusLine = _HYBgm_STATUS_LINE_CACHE & " " & (node_id+1) & "/" & num_nodes
1655 & " nodes (" & (1.0+node_id)/seconds_accumulator & "/second)";
1656
1657 #if defined __HEADLESS__
1658 SetStatusLine (statusLine);
1659 #else
1660 SetStatusLine (empty,statusLine,empty,100*(float)node_id/(num_nodes),HY_SL_TASK|HY_SL_PERCENT);
1661 #endif
1662 TimerDifferenceFunction (false); // reset timer for the next second
1663 yieldCPUTime (); // let the GUI handle user actions
1664
1665 if (terminate_execution) { // user wants to cancel the analysis
1666 break;
1667 }
1668 }
1669 #endif
1670
1671 } // end for loop over nodes
1672 #endif
1673
1674
1675 #if defined __HYPHYMPI_ND__
1676 }
1677 #endif
1678
1679 #if !defined __UNIX__ || defined __HEADLESS__
1680 SetStatusLine (_HYBgm_STATUS_LINE_CACHE_DONE);
1681 #endif
1682
1683 scores_cached = TRUE;
1684
1685
1686 }
1687
1688
1689
1690
1691 //___________________________________________________________________________________________
1692 #if defined __HYPHYMPI__
MPIReceiveScores(_Matrix * mpi_node_status,bool sendNextJob,long node_id)1693 void Bgm::MPIReceiveScores (_Matrix * mpi_node_status, bool sendNextJob, long node_id)
1694 {
1695 _Matrix single_parent_scores (num_nodes, 1, false, true);
1696 MPI_Status status;
1697
1698 ReportMPIError (MPI_Recv (single_parent_scores.theData, num_nodes, MPI_DOUBLE, MPI_ANY_SOURCE, HYPHY_MPI_DATA_TAG, MPI_COMM_WORLD, &status), false);
1699
1700
1701 long senderID = (long) status.MPI_SOURCE,
1702 this_node = (long) (*mpi_node_status) (senderID, 1),
1703 maxp = max_parents.list_data[this_node];
1704
1705 _List * this_list = (_List *) node_scores.list_data[this_node];
1706
1707
1708 mpi_node_status->Store (senderID, 0, 0); // set node status to idle
1709
1710
1711 _String mxString,
1712 mxName;
1713
1714 ReportWarning (_String("Received scores for child ") & this_node & " from node " & senderID);
1715 #ifdef __DEBUG_MPIRS__
1716 mxName = _String ("single_parent_scores");
1717 single_parent_scores.Serialize (mxString, mxName);
1718 ReportWarning (mxString);
1719 #endif
1720 (*this_list) && (&single_parent_scores);
1721
1722
1723
1724 _SimpleList parents,
1725 all_but_one (num_nodes-1, 0, 1),
1726 aux_list,
1727 nk_tuple;
1728
1729 hyFloat score;
1730
1731 // parse nk-tuple results
1732 for (long np = 2; np <= maxp; np++) {
1733 _NTupleStorage family_scores (num_nodes-1, np);
1734 bool remaining;
1735
1736
1737 if (all_but_one.NChooseKInit (aux_list, nk_tuple, np, false)) {
1738 long score_index = 0,
1739 num_ktuples = exp(LnGamma(num_nodes) - LnGamma(num_nodes - np) - LnGamma(np+1)),
1740 ntuple_receipt;
1741
1742 _Matrix scores_to_store (num_ktuples, 1, false, true);
1743
1744 // receive nk-tuple indexed node scores from same compute node
1745 ReportMPIError (MPI_Recv (scores_to_store.theData, num_ktuples, MPI_DOUBLE, senderID, HYPHY_MPI_DATA_TAG, MPI_COMM_WORLD, &status), false);
1746 #ifdef __DEBUG_MPIRS__
1747 mxName = _String("tuple_scores") & np;
1748 mxString.Initialize();
1749 scores_to_store.Serialize (mxString, mxName);
1750 ReportWarning (mxString);
1751 #endif
1752
1753 do {
1754 remaining = all_but_one.NChooseK (aux_list, nk_tuple); // update nk-tuple in aux_list
1755 ntuple_receipt = family_scores.Store (scores_to_store(score_index, 0), nk_tuple);
1756 score_index++;
1757 } while (remaining);
1758 } else {
1759 _String oops ("Failed to initialize _NTupleStorage object in Bgm::CacheNodeScores().\n");
1760 WarnError(oops);
1761 }
1762
1763 (*this_list) && (&family_scores);
1764 }
1765
1766
1767 // send next job
1768 if (sendNextJob) {
1769 ReportMPIError(MPI_Send(&node_id, 1, MPI_LONG, senderID, HYPHY_MPI_VARS_TAG, MPI_COMM_WORLD), true);
1770 mpi_node_status->Store (senderID, 0, 1); // reset busy signal
1771 mpi_node_status->Store (senderID, 1, node_id);
1772 ReportWarning (_String ("Sent child ") & node_id & " to node " & senderID);
1773 }
1774 }
1775 #endif
1776
1777
1778
1779 //___________________________________________________________________________________________
ExportGraph(void)1780 _Matrix * Bgm::ExportGraph (void)
1781 {
1782 _Matrix * export_graph = new _Matrix ((_Matrix &)dag); // duplicator
1783
1784 return (_Matrix *) (export_graph->makeDynamic());
1785 }
1786
1787
1788
1789
1790 //___________________________________________________________________________________________
1791 // Allocate memory for storing node scores and edge posteriors
1792
InitComputeLists(_List * compute_list)1793 void Bgm::InitComputeLists (_List * compute_list)
1794 {
1795 // adaptive storage for floating point numbers
1796 _GrowingVector * newstore;
1797 checkPointer (newstore = new _GrowingVector);
1798
1799 for (long i = 0; i < num_nodes * num_nodes; i++) {
1800 (*compute_list) && newstore;
1801 }
1802
1803 DeleteObject (newstore);
1804 }
1805
1806
1807
1808 //___________________________________________________________________________________________
1809 // Free up memory allocated to cached node scores and edge posteriors
1810
DumpComputeLists(_List * compute_list)1811 void Bgm::DumpComputeLists (_List * compute_list)
1812 {
1813 for (long i = 0; i < compute_list->lLength; i++) {
1814 ((_GrowingVector *) compute_list->list_data[i]) -> Clear();
1815 }
1816
1817 compute_list->Clear();
1818 }
1819
1820
1821
1822 //___________________________________________________________________________________________
Compute(_SimpleList * node_order,_List * results)1823 hyFloat Bgm::Compute (_SimpleList * node_order, _List * results)
1824 {
1825 /* Calculate equation (8) from Friedman and Koller (2003), i.e. joint probability
1826 of data by summing all families at i-th node that are consistent with node order,
1827 then taking product across all nodes in the network. Traverse using AVL indexing.
1828
1829 Node order is a vector of length [num_nodes] containing rank of each node, i.e.
1830 a position in an ordered sequence. In addition, compute marginal posteriors of
1831 each potential edge in the graph, by Proposition 3.2. */
1832
1833
1834
1835 hyFloat log_likel = 0.;
1836 _GrowingVector *gv1, *gv2;
1837
1838
1839 // reset _GrowingVector objects stored in _List object
1840 for (long i = 0; i < num_nodes * num_nodes; i++) {
1841 gv1 = (_GrowingVector *) results->list_data[i];
1842 gv1 -> ZeroUsed();
1843 }
1844
1845
1846 for (long nodeIndex = 0; nodeIndex < node_order->lLength; nodeIndex++) {
1847 long child_node = node_order->list_data[nodeIndex],
1848 maxp = max_parents.list_data[child_node];
1849
1850 _List * score_lists = (_List *) node_scores.list_data[child_node];
1851 _Constant * orphan_score = (_Constant *) (score_lists->list_data[0]);
1852
1853
1854 gv1 = (_GrowingVector *) results->list_data[child_node * num_nodes + child_node]; // store denominator in diagonal
1855 gv1->ZeroUsed();
1856 gv1 -> Store (orphan_score->Value()); // handle case of no parents
1857
1858
1859
1860 if (maxp > 0) {
1861 // all nodes to the right are potential parents, except banned parents!
1862 _SimpleList precedes;
1863 for (long parIndex = nodeIndex + 1; parIndex < node_order->lLength; parIndex++) {
1864 long par = node_order->list_data[parIndex];
1865
1866 if (banned_edges(par, child_node) == 0) {
1867 precedes << par;
1868 }
1869 }
1870
1871
1872 // handle trivial case of one parent
1873 _Matrix * single_parent_scores = (_Matrix *) (score_lists->list_data[1]);
1874
1875 for (long i = 0; i < precedes.lLength; i++) {
1876 long par = precedes.list_data[i];
1877
1878 gv1 -> Store ((*single_parent_scores) (par, 0));
1879 gv2 = (_GrowingVector *) results->list_data[child_node * num_nodes + par];
1880 gv2 -> Store ((*single_parent_scores) (par, 0));
1881 }
1882
1883
1884 // more than one parent requires k-tuples
1885 if (maxp > 1) {
1886 _SimpleList indices (precedes.lLength, 0, 1); // populates list with 0, 1, 2, ..., M-1
1887 // where M is the number of eligible parents in [precedes]
1888 _NTupleStorage * family_scores;
1889
1890 for (long nparents = 2; nparents <= maxp; nparents++) {
1891 _SimpleList subset,
1892 auxil;
1893
1894 bool not_finished;
1895
1896
1897 if (nparents > precedes.lLength) { // not enough eligible parents to form tuples!
1898 break;
1899 }
1900
1901
1902 if (indices.NChooseKInit (auxil, subset, nparents, false)) {
1903 hyFloat tuple_score;
1904 _SimpleList parents;
1905
1906 parents.Populate (nparents, 0, 0); // allocate memory
1907
1908 family_scores = (_NTupleStorage *) (score_lists->list_data[nparents]);
1909
1910
1911 do {
1912 //parents.Clear();
1913 not_finished = indices.NChooseK (auxil, subset); // cycle through index combinations
1914
1915
1916 for (long i = 0; i < nparents; i++) { // convert indices to parent IDs (skipping child)
1917 long realized = precedes.list_data[subset.list_data[i]];
1918 if (realized >= child_node) {
1919 realized--;
1920 }
1921 parents.list_data[i] = realized;
1922 }
1923 parents.Sort(TRUE);
1924
1925 tuple_score = family_scores -> Retrieve (parents);
1926
1927 gv1 -> Store (tuple_score);
1928
1929 for (long i = 0; i < nparents; i++) {
1930 gv2 = (_GrowingVector *) results->list_data[child_node * num_nodes + precedes.list_data[subset.list_data[i]]];
1931 gv2 -> Store (tuple_score);
1932 }
1933 } while (not_finished);
1934 }
1935 }
1936 }
1937 }
1938
1939 gv1 -> _Matrix::Store (0, 0, LogSumExpo(gv1)); // replace first entry with sum, i.e. marginal log-likelihood of child node
1940 log_likel += (*gv1)(0, 0);
1941
1942 }
1943 // end loop over child nodes
1944
1945 return log_likel;
1946 }
1947
1948
1949
1950 //___________________________________________________________________________________________
Compute(void)1951 hyFloat Bgm::Compute (void)
1952 {
1953 //CacheNodeScores();
1954
1955 // return posterior probability of a given network defined by 'dag' matrix
1956 hyFloat log_score = 0.;
1957
1958 for (long node_id = 0; node_id < num_nodes; node_id++) {
1959 log_score += is_discrete.list_data[node_id] ? ComputeDiscreteScore (node_id) : ComputeContinuousScore (node_id);
1960 }
1961
1962 return log_score;
1963 }
1964
1965
1966
1967 //___________________________________________________________________________________________
Compute(_Matrix * g)1968 hyFloat Bgm::Compute (_Matrix * g)
1969 {
1970 //CacheNodeScores();
1971
1972 // return posterior probability of a given network defined by 'dag' matrix
1973 hyFloat log_score = 0.;
1974
1975 for (long node_id = 0; node_id < num_nodes; node_id++) {
1976 log_score += is_discrete.list_data[node_id] ? ComputeDiscreteScore (node_id, g) : ComputeContinuousScore (node_id, g);
1977 }
1978
1979 return log_score;
1980 }
1981
1982
1983
1984
1985 //___________________________________________________________________________________________
Optimize(_AssociativeList const * options)1986 _Matrix * Bgm::Optimize (_AssociativeList const * options) {
1987 #ifdef DEBUG_OPTIMIZE
1988 char buf [255];
1989 #endif
1990
1991 if (!scores_cached) {
1992 CacheNodeScores();
1993 }
1994
1995
1996 hyFloat num_restarts, // HBL settings
1997 num_randomize;
1998
1999 checkParameter (maxNumRestart, num_restarts, 1.);
2000 checkParameter (numRandomize, num_randomize, num_nodes);
2001 //checkParameter (useNodeOrder, use_node_order, 0);
2002
2003
2004
2005 // char bug [255];
2006 hyFloat optMethod; /* 0 = K2 fixed order; 1 = K2 shuffle order with restarts */
2007 /* 2 = MCMC fixed order; 3 = MCMC over networks and orders */
2008 checkParameter (bgmOptimizationMethod, optMethod, 0.);
2009
2010 if (optMethod > 1) {
2011 return GraphMCMC( (optMethod < 3) ? TRUE : FALSE); // use MCMC to evaluate posterior probability of network space
2012 }
2013
2014
2015
2016 hyFloat this_score, best_score, next_score;
2017
2018 _Matrix orderMx (num_nodes, num_nodes, false, true),
2019 best_dag (num_nodes, num_nodes, false, true);
2020
2021 bool reshuffle_order = FALSE;
2022
2023
2024 // if best node order hasn't been estimated,
2025 if (optMethod == 1 && best_node_order.lLength == 0) {
2026 reshuffle_order = TRUE;
2027 best_node_order.Populate (num_nodes, 0, 1);
2028 best_node_order.Permute (1);
2029 }
2030
2031
2032 // Convert node order to binary matrix where edge A->B is permitted if
2033 // orderMx[B][A] = 1, i.e. B is to the right of A in node order.
2034 for (long i = 0; i < num_nodes; i++) {
2035 long child = best_node_order.list_data[i];
2036
2037 for (long j = 0; j < num_nodes; j++) {
2038 long parent = best_node_order.list_data[j];
2039
2040 orderMx.Store (parent, child, (j > i) ? 1 : 0);
2041 }
2042 }
2043
2044
2045 // greedy hill-climbing algorithm (K2)
2046 ResetGraph (nil);
2047 best_score = Compute(); // best over all node orders (if we're reshuffling)
2048
2049
2050 for (long iter = 0; iter < num_restarts; iter++) {
2051 next_score = Compute(); // reset to empty graph score
2052
2053
2054 for (long child = 0; child < num_nodes; child++) {
2055 long num_parents = 0,
2056 improvement_flag = 0,
2057 next_parent_to_add;
2058
2059 do {
2060 for (long parent = 0; parent < num_nodes; parent++) {
2061 if ( (parent != child) // must meet all conditions!
2062 && (dag(parent, child) == 0)
2063 && (orderMx (parent, child) == 1)
2064 && banned_edges(parent, child) == 0) {
2065 dag.Store (parent, child, 1);
2066 this_score = Compute();
2067
2068 if (this_score > next_score) {
2069 improvement_flag = 1;
2070 next_score = this_score;
2071 next_parent_to_add = parent;
2072 }
2073 dag.Store (parent, child, 0); // revert
2074 }
2075 }
2076
2077 if (improvement_flag) { // adding another parent improves network score
2078 dag.Store (next_parent_to_add, child, 1);
2079 num_parents++;
2080 improvement_flag = 0; // reset for next parent
2081 } else {
2082 break; // unable to improve further
2083 }
2084 } while (num_parents < max_parents.list_data[child]);
2085 }
2086
2087
2088 if (reshuffle_order) {
2089 this_score = Compute();
2090
2091 if (this_score > best_score) {
2092 best_score = this_score;
2093 best_dag = (_Matrix &) dag; // store graph optimized from the current ordering
2094 }
2095
2096 ResetGraph (nil);
2097
2098 best_node_order.Permute (1);
2099
2100 for (long i = 0; i < num_nodes; i++) {
2101 long child = best_node_order.list_data[i];
2102 for (long j = 0; j < num_nodes; j++) {
2103 long parent = best_node_order.list_data[j];
2104 orderMx.Store (parent, child, (j > i) ? 1 : 0);
2105 }
2106 }
2107 } else {
2108 break; // only one iteration when node ordering is known
2109 }
2110 }
2111
2112 if (reshuffle_order) {
2113 dag = (_Matrix &) best_dag;
2114 best_node_order.Clear(); // reset to initial state
2115 }
2116
2117
2118 _Matrix * result = new _Matrix(num_nodes * num_nodes, 2, false, true);
2119 result->Store (0, 0, Compute());
2120
2121 for (long row = 0; row < num_nodes; row++) {
2122 for (long col = 0; col < num_nodes; col++) {
2123 result->Store (row*num_nodes+col, 1, dag(row, col));
2124 }
2125 }
2126
2127 return (_Matrix*)(result->makeDynamic());
2128 }
2129
2130
2131
2132 //___________________________________________________________________________________________
GraphMCMC(bool fixed_order)2133 _Matrix * Bgm::GraphMCMC (bool fixed_order)
2134 {
2135 _Matrix * proposed_graph = new _Matrix (num_nodes, num_nodes, false, true),
2136 * orderMx = new _Matrix (num_nodes, num_nodes, false, true);
2137
2138 _Matrix current_graph (num_nodes, num_nodes, false, true),
2139 best_graph (num_nodes, num_nodes, false, true);
2140
2141 hyFloat mcmc_steps, mcmc_burnin, mcmc_samples,
2142 current_score, proposed_score, best_score,
2143 num_randomize,
2144 lk_ratio;
2145
2146 long sampling_interval;
2147
2148 _SimpleList * proposed_order = new _SimpleList();
2149 _SimpleList current_order;
2150
2151
2152 // parse HBL settings
2153 checkParameter (mcmcSteps, mcmc_steps, 0);
2154 if (mcmc_steps == 0) {
2155 _String oops ("You asked HyPhy to run MCMC with zero steps in the chain! Did you forget to set Bgm_MCMC_STEPS?\n");
2156 WarnError (oops);
2157 }
2158
2159 checkParameter (mcmcBurnin, mcmc_burnin, 0);
2160 checkParameter (mcmcSamples, mcmc_samples, 0);
2161 if (mcmc_samples == 0) {
2162 _String oops ("You're asking me to run MCMC without reporting any results. Did you forget to set Bgm_MCMC_SAMPLES?\n");
2163 WarnError (oops);
2164 }
2165
2166 sampling_interval = (long) mcmc_steps / (long) mcmc_samples;
2167
2168 checkParameter (numRandomize, num_randomize, num_nodes*num_nodes);
2169
2170
2171 // Contents: (1) chain trace;
2172 // (2) model-averaged edge probabilities;
2173 // (3) best graph;
2174 _Matrix * result;
2175
2176 checkPointer (result = new _Matrix ( (mcmc_samples > num_nodes*num_nodes) ? mcmc_samples : num_nodes*num_nodes, 3, false, true));
2177
2178
2179
2180 // Set current graph to member object (dag, an empty graph by default)
2181 ResetGraph (proposed_graph);
2182
2183 // does this graph conform to enforced/banned edges, maximum parentage settings?
2184 _SimpleList pars;
2185 for (long chi = 0; chi < num_nodes; chi++) {
2186 pars.Clear();
2187 for (long par = 0; par < num_nodes; par++) {
2188 if ( (*proposed_graph)(par,chi) == 1) {
2189 if ( banned_edges(par,chi) ) {
2190 proposed_graph->Store(par, chi, 0);
2191 ReportWarning(_String("Deleted banned edge ") & par & "->" & chi & " from graph");
2192 }
2193 pars << par;
2194 } else if ( (*proposed_graph)(par,chi) == 0 && enforced_edges(par,chi) ) {
2195 proposed_graph->Store(par, chi, 1);
2196 ReportWarning(_String("Restored enforced edge ") & par & "->" & chi & " to graph");
2197 }
2198 }
2199
2200 if (pars.lLength > max_parents.list_data[chi]) {
2201 ReportWarning(_String("Number of parents (") & (long)pars.lLength & ")exceed maximum setting for child node " & chi & ": " & max_parents.list_data[chi] & "\n");
2202 pars.Permute(1);
2203 for (long p = 0; p < (long)pars.lLength - max_parents.list_data[chi]; p++) {
2204 proposed_graph->Store(pars.list_data[p], chi, 0);
2205 ReportWarning(_String("Deleted excess edge ") & pars.list_data[p] & "->" & chi & " from graph");
2206 }
2207 }
2208 }
2209
2210
2211
2212 if (fixed_order) {
2213 // coerce graph to conform to user-defined node order
2214 if (best_node_order.lLength == 0) {
2215 _String oops ("Cannot run fixed-order graph MCMC without defined order (via order-MCMC). Run CovarianceMatrix(receptacle, BGM) first.");
2216 WarnError (oops);
2217
2218 result = new _Matrix(); // return an empty matrix
2219 } else {
2220 proposed_order->Populate(num_nodes, 0, 0); // allocate memory
2221 for (long i = 0; i < num_nodes; i++) {
2222 proposed_order->list_data[i] = best_node_order.list_data[i];
2223 }
2224
2225 ReportWarning (_String("Transferring best node order to proposal: ") & (_String *)proposed_order->toStr());
2226 }
2227 } else {
2228 if (best_node_order.lLength == 0) {
2229 // quickly generate a node order from graph
2230 for (long order_index, onode, node = 0; node < num_nodes; node++) {
2231 // locate the lowest-ranked parent in the order, if any
2232 for (order_index = 0; order_index < proposed_order->lLength; order_index++) {
2233 onode = proposed_order->list_data[order_index];
2234 if ((*proposed_graph) (onode, node)) {
2235 // insert new node immediately left of this parent
2236 proposed_order->InsertElement ((BaseRef) node, order_index, false, false);
2237 break;
2238 }
2239 }
2240 // if reached end of order list without finding a parent, push node onto start of list
2241 if (order_index == proposed_order->lLength) {
2242 proposed_order->InsertElement ((BaseRef) node, 0, false, false);
2243 }
2244 }
2245 ReportWarning(_String("Constructed node order from graph:\n") & (_String *)proposed_order->toStr() & "\n");
2246 } else {
2247 // restore order
2248 proposed_order->Populate(num_nodes,0,1);
2249 for (long i = 0; i < num_nodes; i++) {
2250 proposed_order->list_data[i] = best_node_order.list_data[i];
2251 }
2252 }
2253 }
2254
2255
2256 // randomize graph
2257 if (num_randomize > 0) {
2258 RandomizeGraph (proposed_graph, proposed_order, (long)num_randomize, fixed_order);
2259 } else {
2260 ReportWarning(_String("NUM_RANDOMIZE set to 0, skipping initial RandomizeGraph()\n"));
2261 }
2262
2263
2264
2265 // Populate order matrix -- and while we're at it, set proposed order to current order
2266 for (long i = 0; i < num_nodes; i++) {
2267 long child = proposed_order->list_data[i];
2268
2269 for (long j = 0; j < num_nodes; j++) {
2270 // if A precedes B, then set entry (A,B) == 1, permitting edge A->B
2271 orderMx->Store (proposed_order->list_data[j], child, (j > i) ? 1 : 0);
2272 }
2273 }
2274 ReportWarning(_String("Populated order matrix\n"));
2275
2276
2277 // status line
2278 #if !defined __UNIX__ || defined __HEADLESS__
2279 long updates = 0;
2280 TimerDifferenceFunction (false);
2281 #if defined __HEADLESS__
2282 SetStatusLine (_HYBgm_STATUS_LINE_MCMC);
2283 #else
2284 SetStatusLine (empty, _HYBgm_STATUS_LINE_MCMC, empty, 0, HY_SL_TASK|HY_SL_PERCENT);
2285 #endif
2286 #endif
2287
2288
2289
2290
2291
2292 current_order = (*proposed_order);
2293 current_graph = (_Matrix &) (*proposed_graph);
2294 best_graph = (_Matrix &) (*proposed_graph);
2295 best_score = proposed_score = current_score = Compute(proposed_graph);
2296
2297 ReportWarning(_String("Initiating MCMC with graph:\n") & (_String *) current_graph.toStr() );
2298
2299
2300 // MAIN LOOP
2301 for (long step = 0; step < mcmc_steps + mcmc_burnin; step++) {
2302 RandomizeGraph (proposed_graph, proposed_order, 1, fixed_order);
2303
2304 proposed_score = Compute(proposed_graph);
2305
2306
2307 #ifdef __DEBUG_GMCMC__
2308 snprintf (bug, sizeof(bug), "Current score = %f\n", current_score);
2309 BufferToConsole (bug);
2310 snprintf (bug, sizeof(bug), "Propose graph:\n");
2311 BufferToConsole (bug);
2312 PrintGraph (proposed_graph);
2313 snprintf (bug, sizeof(bug), "Proposed score = %f\n", proposed_score);
2314 BufferToConsole (bug);
2315 #endif
2316
2317
2318 lk_ratio = exp(proposed_score - current_score);
2319
2320 if (lk_ratio > 1. || genrand_real2() < lk_ratio) { // Metropolis-Hastings
2321 #ifdef __DEBUG_GMCMC__
2322 snprintf (bug, sizeof(bug), "Accept move\n");
2323 BufferToConsole (bug);
2324 #endif
2325 current_graph = (_Matrix &) (*proposed_graph); // accept proposed graph
2326 current_score = proposed_score;
2327
2328 for (long foo = 0; foo < num_nodes; foo++) {
2329 current_order.list_data[foo] = proposed_order->list_data[foo];
2330 }
2331
2332 if (current_score > best_score) {
2333 best_score = current_score;
2334 best_graph = (_Matrix &) current_graph; // keep track of best graph ever visited by chain
2335
2336 #ifdef __DEBUG_GMCMC__
2337 PrintGraph (&best_graph);
2338
2339 snprintf (bug, sizeof(bug), "Update best node ordering: ");
2340 BufferToConsole (bug);
2341 for (long deb = 0; deb < num_nodes; deb++) {
2342 snprintf (bug, sizeof(bug), "%d ", current_order.list_data[deb]);
2343 BufferToConsole (bug);
2344 }
2345 NLToConsole ();
2346 #endif
2347 }
2348 } else {
2349 #ifdef __DEBUG_GMCMC__
2350 snprintf (bug, sizeof(bug), "Reject move\n");
2351 BufferToConsole (bug);
2352 #endif
2353 // revert proposal
2354 for (long row = 0; row < num_nodes; row++) {
2355 proposed_order->list_data[row] = current_order.list_data[row];
2356
2357 for (long col = 0; col < num_nodes; col++) {
2358 proposed_graph->Store(row, col, current_graph(row, col));
2359 }
2360 }
2361 }
2362
2363
2364
2365 if (step >= mcmc_burnin) { // handle output
2366 if ( (step-(long)mcmc_burnin) % sampling_interval == 0) {
2367 long entry = (long int) ((step-mcmc_burnin) / sampling_interval);
2368
2369 result->Store (entry, 0, current_score); // update chain trace
2370
2371 for (long row = 0; row < num_nodes; row++) { // update edge tallies
2372 for (long offset=row*num_nodes, col = 0; col < num_nodes; col++) {
2373 // row = parent, col = child
2374 result->Store (offset+col, 1, (*result)(offset+col,1) + current_graph(row, col));
2375 }
2376 }
2377 }
2378 }
2379
2380 #if !defined __UNIX__ || defined __HEADLESS__
2381 if (TimerDifferenceFunction(true)>1.0) { // time to update
2382 updates ++;
2383 _String statusLine = _HYBgm_STATUS_LINE_MCMC & " " & (step+1) & "/" & (mcmc_steps + mcmc_burnin)
2384 & " steps (" & (1.0+step)/updates & "/second)";
2385 #if defined __HEADLESS__
2386 SetStatusLine (statusLine);
2387 #else
2388 SetStatusLine (empty,statusLine,empty,100*step/(mcmc_steps + mcmc_burnin),HY_SL_TASK|HY_SL_PERCENT);
2389 #endif
2390 TimerDifferenceFunction(false); // reset timer for the next second
2391 yieldCPUTime(); // let the GUI handle user actions
2392 if (terminate_execution) { // user wants to cancel the analysis
2393 break;
2394 }
2395 }
2396 #else
2397 if (step % (long)((mcmc_steps + mcmc_burnin)/100) == 0) {
2398 ReportWarning (_String ("GraphMCMC at step ") & step & " of " & (mcmc_steps+mcmc_burnin) & " with posterior " & current_score & " and graph:\n" & (_String *)current_graph.toStr() & " and order:\n" & (_String *)current_order.toStr() & "\n");
2399 }
2400 #endif
2401 }
2402
2403 // convert edge tallies to frequencies, and report best graph
2404 for (long row = 0; row < num_nodes; row++) {
2405 for (long offset=row*num_nodes, col = 0; col < num_nodes; col++) {
2406 result->Store (offset+col, 1, (*result)(offset+col,1)/mcmc_samples);
2407 result->Store (offset+col, 2, best_graph(row, col));
2408 }
2409 }
2410
2411
2412 // set dag member to last visit graph
2413 dag = (_Matrix &) current_graph;
2414 best_node_order = current_order;
2415
2416 delete (proposed_graph);
2417 delete (orderMx);
2418 delete (proposed_order);
2419
2420 return (_Matrix *) (result->makeDynamic());
2421 }
2422
2423
2424
2425 //___________________________________________________________________________________________
2426 // DEPRECATED
TryEdge(long child,long parent,long operation,hyFloat old_score)2427 hyFloat Bgm::TryEdge (long child, long parent, long operation, hyFloat old_score)
2428 {
2429 #ifdef _NEVER_DEFINED_
2430 hyFloat log_score,
2431 last_state1 = dag (child, parent),
2432 last_state2 = dag (parent, child);
2433
2434 /*
2435 if (is_marginal) // modification affects only child node, don't recalculate other scores
2436 log_marginal = is_discrete(child,0) ? ComputeDiscreteScore (child) : ComputeContinuousScore (child);
2437 */
2438
2439 switch (operation) {
2440 case 0:
2441 if (MarginalSum (FALSE, child) < max_parents && banned_edges(parent,child) == 0) {
2442 dag.Store (parent, child, 1.); // add an edge [parent] --> [child]
2443 dag.Store (child, parent, 0.);
2444 } else {
2445 return old_score;
2446 }
2447 break;
2448
2449 case 1: // reverse an edge
2450 if (MarginalSum (FALSE, parent) < max_parents && banned_edges(child,parent) == 0) {
2451 dag.Store (child, parent, last_state2); // switch states
2452 dag.Store (parent, child, last_state1);
2453 } else {
2454 return old_score;
2455 }
2456 break;
2457
2458 case 2: // delete an edge
2459 dag.Store (child, parent, 0.);
2460 dag.Store (parent, child, 0.);
2461 break;
2462
2463 default:
2464 return old_score;
2465 break;
2466 }
2467
2468 // PrintGraph(nil);
2469
2470 if (IsCyclic()) { // new graph is cyclic, reject
2471 dag.Store (child, parent, last_state1);
2472 dag.Store (parent, child, last_state2);
2473 return old_score;
2474 }
2475
2476
2477 /*
2478 if (is_marginal) // update one term only
2479 log_score = old_score - log_marginal + (is_discrete(child,0) ? ComputeDiscreteScore (child) : ComputeContinuousScore (child));
2480 else
2481 log_score = Compute();
2482 */
2483
2484
2485 log_score = Compute();
2486
2487 if (log_score > old_score) { // keep new graph
2488 return log_score;
2489 } else {
2490 // restore previous settings
2491 dag.Store (child, parent, last_state1);
2492 dag.Store (parent, child, last_state2);
2493 }
2494
2495 return old_score;
2496 #endif
2497 return 0;
2498 }
2499
2500
2501
2502
2503 //___________________________________________________________________________________________
2504 // Calculating equation (8) of Friedman and Koller (2003) requires the term:
2505 // log ( Sum ( score(x_i, U | D) ) )
2506 // where x_i are very small numbers stored as log(x_i)'s but taking the exponential of
2507 // the log values can result in numerical underflow.
2508
LogSumExpo(_GrowingVector * log_values)2509 hyFloat Bgm::LogSumExpo (_GrowingVector * log_values)
2510 {
2511 long size = log_values->GetUsed();
2512 hyFloat sum_exponents = 0.;
2513
2514
2515 // handle some trivial cases
2516 if (size == 0) {
2517 return 0.; // log(exp(0)) = log(1) = 0
2518 } else if (size == 1) {
2519 return (*log_values)(0,0); // log(exp(log(x)))
2520 }
2521
2522
2523 // find the largest (least negative) log-value
2524 hyFloat max_log = (*log_values) (0, 0),
2525 this_log;
2526
2527 for (long val = 1; val < size; val++) {
2528 this_log = (*log_values) (val, 0);
2529
2530 if (this_log > max_log) {
2531 max_log = this_log;
2532 }
2533 }
2534
2535 // go back through log values and increment by max log value
2536 // This will cause some underflow for the smallest values, but we can
2537 // use this approximation to handle very large ranges of values.
2538 // NOTE: subtracting a negative value.
2539 for (long val = 0; val < size; val++) {
2540 sum_exponents += exp( (*log_values) (val, 0) - max_log );
2541 }
2542
2543 return (log(sum_exponents) + max_log);
2544 }
2545
2546
2547 //___________________________________________________________________________________________
2548 // Markov chain Monte Carlo for Bgm
CovarianceMatrix(_SimpleList * unused)2549 _PMathObj Bgm::CovarianceMatrix (_SimpleList * unused)
2550 {
2551 //char bug [255];
2552
2553
2554 hyFloat mcmc_steps,
2555 mcmc_burnin,
2556 mcmc_samples,
2557 mcmc_nchains,
2558 mcmc_temp;
2559
2560 _SimpleList * node_order;
2561
2562 if (last_node_order.lLength == 0) {
2563 node_order = new _SimpleList (num_nodes, 0, 1); // populate with series (0, 1, ..., num_nodes)
2564
2565
2566 node_order->Permute(1); // initialize with randomized node ordering
2567
2568 } else {
2569 node_order = new _SimpleList (last_node_order);
2570 }
2571
2572
2573
2574 _Matrix * mcmc_output;
2575
2576
2577 // acquisition of HBL arguments with a few sanity checks
2578 checkParameter (mcmcSteps, mcmc_steps, 0);
2579 checkParameter (mcmcBurnin, mcmc_burnin, 0);
2580 checkParameter (mcmcSamples, mcmc_samples, 0);
2581 checkParameter (mcmcNumChains, mcmc_nchains, 1);
2582 checkParameter (mcmcTemperature, mcmc_temp, 1);
2583
2584 if (mcmc_steps == 0) {
2585 _String oops ("You asked HyPhy to run MCMC with zero steps in the chain! Did you forget to set Bgm_MCMC_STEPS?\n");
2586 WarnError (oops);
2587 }
2588
2589 if (mcmc_samples == 0) {
2590 _String oops ("You're asking me to run MCMC without reporting any results. Did you forget to set Bgm_MCMC_SAMPLES?\n");
2591 WarnError (oops);
2592 }
2593
2594
2595
2596 // allocate space for output
2597 if (mcmc_nchains == 1) {
2598 mcmc_output = RunColdChain (node_order, (long) mcmc_burnin, -1); // negative argument indicates no sampling
2599 delete (mcmc_output); // discard burn-in
2600
2601 mcmc_output = RunColdChain (node_order, (long) mcmc_steps, (long int) (mcmc_steps / mcmc_samples) );
2602 } else { // run coupled MCMC
2603 node_order->Permute(1);
2604 // RunHotChain (node_order, (long)
2605
2606 /* STILL UNDER DEVELOPMENT! */
2607 }
2608
2609 DeleteObject (node_order);
2610
2611 return mcmc_output;
2612 }
2613
2614
2615
2616 //___________________________________________________________________________________________
RunColdChain(_SimpleList * current_order,long nsteps,long sample_lag)2617 _Matrix * Bgm::RunColdChain (_SimpleList * current_order, long nsteps, long sample_lag)
2618 {
2619 /* Execute Metropolis sampler using a swap of two nodes in an ordered sequence
2620 as a proposal function. The posterior probabilities of edges in the network are
2621 stored as a member matrix [edge_posteriors]. Note that the total number of
2622 possible orderings (i.e. permutations of a sequence of length N) is factorial(N),
2623 and can possibly be computed exactly for N < 8. */
2624
2625 #ifdef DEBUG_RCC
2626 char buf [255];
2627 #endif
2628
2629 long row,
2630 first_node, second_node,
2631 mcmc_samples = (long int) (nsteps / sample_lag);
2632
2633 _SimpleList proposed_order;
2634
2635 _Matrix * mcmc_output = new _Matrix (mcmc_samples > (num_nodes*num_nodes) ? mcmc_samples : (num_nodes*num_nodes), 4, false, true);
2636 // Contents: (1) chain trace; (2) edge marginal posterior probabilities;
2637 // (3) best node ordering; (4) last node ordering (for restarting chains).
2638
2639 _List * clist = new _List (); // compute list
2640
2641
2642 InitComputeLists (clist); // storage for marginal node- and edge-scores
2643
2644
2645
2646 hyFloat lk_ratio,
2647 prob_current_order = Compute (current_order, clist),
2648 prob_proposed_order = 0.,
2649 best_prob = prob_current_order;
2650
2651
2652 /* SLKP 20070926
2653 Add user feedback via the console window status bar
2654 */
2655 VerbosityLevel();
2656 long howManyTimesUpdated = 0; // how many times has the line been updated; is the same as the # of seconds
2657 TimerDifferenceFunction(false); // save initial timer; will only update every 1 second
2658 #ifdef __HEADLESS__
2659 SetStatusLine (_HYBgm_STATUS_LINE_MCMC & (sample_lag < 0 ? _String(" burnin"):empty));
2660 #else
2661 #ifndef __UNIX__
2662 SetStatusLine (empty,_HYBgm_STATUS_LINE_MCMC & (sample_lag < 0 ? _String(" burnin"):empty),empty,0,HY_SL_TASK|HY_SL_PERCENT);
2663 #else
2664 _String * progressReportFile = NULL;
2665 _Variable * progressFile = CheckReceptacle (&optimizationStatusFile, empty, false);
2666
2667 if (progressFile->ObjectClass () == STRING) {
2668 progressReportFile = ((_FString*)progressFile->Compute())->theString;
2669 }
2670 ConsoleBGMStatus (_HYBgm_STATUS_LINE_MCMC & (sample_lag < 0 ? _String(" burnin"):empty), -1., progressReportFile);
2671 #endif
2672 #endif
2673
2674 /* SLKP */
2675
2676
2677 best_node_order = (_SimpleList &) (*current_order); // initialize class member variable
2678
2679 #ifdef DEBUG_RCC
2680 snprintf (buf, sizeof(buf), "Initial node order (log L = %f): ", prob_current_order);
2681 BufferToConsole (buf);
2682 for (long i = 0; i < num_nodes; i++) {
2683 snprintf (buf, sizeof(buf), "%d ", best_node_order.list_data[i]);
2684 BufferToConsole (buf);
2685 }
2686 NLToConsole();
2687 #endif
2688
2689 for (long step = 0; step < nsteps; step++) {
2690 // copy contents of current order to proposed ordering for permutation
2691 proposed_order.Clear();
2692 proposed_order.Duplicate (current_order);
2693
2694
2695 // swap random nodes in ordered sequence
2696 first_node = genrand_int32() % num_nodes;
2697 second_node = genrand_int32() % num_nodes;
2698
2699 while (first_node == second_node) { // keep sampling until pick different node
2700 second_node = genrand_int32() % num_nodes;
2701 }
2702
2703 proposed_order.Swap (first_node, second_node);
2704
2705
2706 // calculate probability of proposed order --- edge posterior probs loaded into member variable
2707 prob_proposed_order = Compute (&proposed_order, clist);
2708
2709 #ifdef DEBUG_RCC
2710 snprintf (buf, sizeof(buf), "Proposed node order (log L = %f): ", prob_proposed_order);
2711 BufferToConsole (buf);
2712 for (long i = 0; i < num_nodes; i++) {
2713 snprintf (buf, sizeof(buf), "%d ", proposed_order.list_data[i]);
2714 BufferToConsole (buf);
2715 }
2716 NLToConsole();
2717 #endif
2718
2719
2720 lk_ratio = exp(prob_proposed_order - prob_current_order);
2721
2722 if (lk_ratio > 1. || genrand_real2() < lk_ratio) { // then set current to proposed order
2723 (*current_order).Clear();
2724 (*current_order) << proposed_order;
2725 prob_current_order = prob_proposed_order;
2726
2727 #ifdef DEBUG_RCC
2728 snprintf (buf, sizeof(buf), "Accept\n");
2729 BufferToConsole (buf);
2730 #endif
2731
2732 if (prob_proposed_order > best_prob) {
2733 best_prob = prob_proposed_order; // update best node ordering
2734 best_node_order = proposed_order;
2735 }
2736 }
2737
2738
2739 if (sample_lag >= 0) {
2740 // write Markov chain state to output matrix
2741
2742 if (step % sample_lag == 0 && step > 0) { // AFYP 2011/3/2 added this second conditional because the zero-th step was getting recorded
2743 _GrowingVector * gv;
2744 hyFloat denom;
2745
2746 row = (long int) (step / sample_lag) - 1; // AFYP 2011/3/2 need to shift over other info to maintain 0-index
2747
2748 // report log likelhood
2749 mcmc_output->Store (row, 0, prob_current_order);
2750
2751
2752 #ifdef _DEBUG_RCC
2753 snprintf (buf, sizeof(buf), "Contents of clist:\n");
2754 BufferToConsole (buf);
2755
2756 for (long i = 0; i < num_nodes; i++) {
2757 for (long j = 0; j < num_nodes; j++) {
2758 gv = (_GrowingVector *) clist->list_data[i * num_nodes + j];
2759
2760 snprintf (buf, sizeof(buf), "i=%d, j=%d, n=%d: ", i, j, gv->GetUsed());
2761 BufferToConsole (buf);
2762
2763 for (long k = 0; k < gv->GetUsed(); k++) {
2764 snprintf (buf, sizeof(buf), "%f ", (*gv)(k,0));
2765 BufferToConsole (buf);
2766 }
2767
2768 NLToConsole ();
2769 }
2770 }
2771 #endif
2772
2773
2774
2775 // compute marginal edge posteriors
2776 for (long child = 0; child < num_nodes; child++) {
2777 gv = (_GrowingVector *) clist->list_data[child * num_nodes + child];
2778 denom = (*gv)(0, 0); // first entry holds node marginal post pr
2779
2780
2781 for (long edge, parent = 0; parent < num_nodes; parent++) {
2782 if (parent == child) {
2783 continue;
2784 }
2785
2786 edge = child * num_nodes + parent;
2787 gv = (_GrowingVector *) clist->list_data[edge];
2788
2789
2790 if (gv->GetUsed() > 0) {
2791 #ifdef DEBUG_RCC
2792 snprintf (buf, sizeof(buf), "edge %d: pp = %f, gv = ", edge, (*mcmc_output)(edge, 1));
2793 BufferToConsole (buf);
2794
2795 for (long i = 0; i < gv->GetUsed(); i++) {
2796 snprintf (buf, sizeof(buf), "%f ", (*gv) (i,0));
2797 BufferToConsole (buf);
2798 }
2799
2800 snprintf (buf, sizeof(buf), "sum to %f", LogSumExpo (gv));
2801 BufferToConsole (buf);
2802 #endif
2803
2804 mcmc_output->Store (edge, 1, (*mcmc_output)(edge, 1) + exp (LogSumExpo(gv) - denom));
2805
2806 #ifdef DEBUG_RCC
2807 snprintf (buf, sizeof(buf), "= %f\n", (*mcmc_output)(edge, 1));
2808 BufferToConsole (buf);
2809 #endif
2810 }
2811
2812
2813 }
2814 }
2815 }
2816 }
2817
2818
2819 /*SLKP 20070926; include progress report updates */
2820 if (TimerDifferenceFunction(true)>1.0) { // time to update
2821 howManyTimesUpdated ++;
2822 _String statusLine = _HYBgm_STATUS_LINE_MCMC & (sample_lag < 0 ? _String(" burnin"):empty) & " " & (step+1) & "/" & nsteps
2823 & " steps (" & (1.0+step)/howManyTimesUpdated & "/second)";
2824 #if defined __HEADLESS__
2825 SetStatusLine (statusLine);
2826 #else
2827 #if !defined __UNIX__ || defined __HYPHYQT__ || defined __HYPHY_GTK__
2828 SetStatusLine (empty,statusLine,empty,100*step/(nsteps),HY_SL_TASK|HY_SL_PERCENT);
2829 yieldCPUTime(); // let the GUI handle user actions
2830 if (terminate_execution) { // user wants to cancel the analysis
2831 break;
2832 }
2833 #endif
2834 #if defined __UNIX__ && ! defined __HEADLESS__ && !defined __HYPHYQT__ && !defined __HYPHY_GTK__
2835 ConsoleBGMStatus (statusLine, 100.*step/(nsteps), progressReportFile);
2836 #endif
2837 #endif
2838
2839 TimerDifferenceFunction(false); // reset timer for the next second
2840 }
2841 /* SLKP */
2842
2843 }
2844 // end loop over steps
2845
2846
2847 if (sample_lag >= 0) {
2848 // convert sums of edge posterior probs. in container to means
2849 for (long edge = 0; edge < num_nodes * num_nodes; edge++) {
2850 mcmc_output->Store (edge, 1, (*mcmc_output)(edge,1) / mcmc_samples);
2851 }
2852 }
2853
2854
2855 // export node ordering info
2856 for (long node = 0; node < num_nodes; node++) {
2857 mcmc_output->Store (node, 2, (hyFloat) (best_node_order.list_data[node]));
2858 mcmc_output->Store (node, 3, (hyFloat) (current_order->list_data[node]));
2859 }
2860
2861 last_node_order = (_SimpleList &) (*current_order);
2862
2863
2864
2865 // release cached scores (assuming that this is the last analysis to be done!)
2866 DumpComputeLists (clist);
2867
2868 /*SLKP 20070926; include progress report updates */
2869 #if !defined __UNIX__ || defined __HEADLESS__ || defined __HYPHYQT__ || defined __HYPHY_GTK__
2870 SetStatusLine (_HYBgm_STATUS_LINE_MCMC_DONE & (sample_lag < 0 ? _String(" burnin"):empty));
2871 #endif
2872 #if defined __UNIX__ && ! defined __HEADLESS__ && !defined __HYPHYQT__ && !defined __HYPHY_GTK__
2873 ConsoleBGMStatus (_HYBgm_STATUS_LINE_MCMC_DONE & (sample_lag < 0 ? _String(" burnin"):empty), -1.0, progressReportFile);
2874 #endif
2875 /* SLKP */
2876
2877 return mcmc_output;
2878 }
2879
2880
2881
2882 //___________________________________________________________________________________________
2883 #ifdef __NEVER_DEFINED__
RunHotChain(_SimpleList * current_order,long nsteps,long sample_lag,hyFloat temper)2884 void Bgm::RunHotChain (_SimpleList * current_order, long nsteps, long sample_lag, hyFloat temper)
2885 {
2886 /* Execute Metropolis sampler using a swap of two nodes in an ordered sequence
2887 as a proposal function. The posterior probabilities of edges in the network are
2888 stored as a member matrix [edge_posteriors]. Note that the total number of
2889 possible orderings (i.e. permutations of a sequence of length N) is factorial(N),
2890 and can possibly be computed exactly for N < 8. */
2891
2892 _List * clist;
2893
2894 hyFloat prob_current_order = Compute (current_order, clist),
2895 prob_proposed_order = 0.;
2896
2897 static long step_counter = 0;
2898
2899 long sample_interval = (long int) ((nsteps - mcmc_burnin) / mcmc_samples);
2900
2901 _SimpleList proposed_order;
2902
2903 bool accept_step = FALSE;
2904
2905 /* SLKP 20070926
2906 Add user feedback via the console window status bar
2907 */
2908 #if !defined __UNIX__ || defined __HEADLESS__
2909 long howManyTimesUpdated = 0; // how many times has the line been updated; is the same as the # of seconds
2910 TimerDifferenceFunction(false); // save initial timer; will only update every 1 second
2911 SetStatusLine (empty,_HYBgm_STATUS_LINE_MCMC,empty,0,HY_SL_TASK|HY_SL_PERCENT);
2912 #endif
2913 /* SLKP */
2914
2915
2916 for (long step = 0; step < mcmc_steps; step++) {
2917 // copy contents of current order to proposed ordering for permutation
2918 proposed_order.Clear();
2919 proposed_order.Duplicate (current_order);
2920
2921
2922 // swap random nodes in ordered sequence
2923 long first_node = genrand_int32 () % proposed_order.lLength,
2924 second_node = genrand_int32 () % proposed_order.lLength;
2925
2926 while (first_node == second_node) { // keep sampling until pick different node
2927 second_node = genrand_int32 () % proposed_order.lLength;
2928 }
2929
2930 proposed_order.Swap(first_node, second_node);
2931
2932
2933 // calculate probability of proposed order --- edge posterior probs loaded into member variable
2934 prob_proposed_order = Compute (&proposed_order, FALSE);
2935
2936
2937 hyFloat lk_ratio = exp(prob_proposed_order - prob_current_order);
2938
2939 accept_step = FALSE;
2940
2941 if (lk_ratio > 1.) {
2942 accept_step = TRUE; // accept greater likelihood
2943 } else if (genrand_real2() < lk_ratio) {
2944 accept_step = TRUE;
2945 }
2946
2947
2948 if (accept_step) { // then set current to proposed order
2949 (*current_order).Clear();
2950 (*current_order) << proposed_order;
2951 prob_current_order = prob_proposed_order;
2952 }
2953
2954 // write Markov chain state to output matrix
2955 if (step % sample_interval == 0 && step >= mcmc_burnin) {
2956 long row = (long int) ((step-mcmc_burnin) / sample_interval);
2957
2958 mcmc_chain->Store (row, 0, prob_current_order);
2959
2960 for (long edge = 0; edge < num_nodes*num_nodes; edge++) {
2961 long child = edge % num_nodes,
2962 parent = edge / num_nodes;
2963
2964 mcmc_chain->Store (edge, 1, (*mcmc_chain)(edge, 1) + edge_posteriors(child, parent));
2965 }
2966 }
2967
2968 /*SLKP 20070926; include progress report updates */
2969 #if !defined __UNIX__ || defined __HEADLESS__
2970 if (TimerDifferenceFunction(true)>1.0) { // time to update
2971 howManyTimesUpdated ++;
2972 _String statusLine = _HYBgm_STATUS_LINE_MCMC & " " & (step+1) & "/" & mcmc_steps
2973 & " steps (" & (1.0+step)/howManyTimesUpdated & "/second)";
2974 SetStatusLine (empty,statusLine,empty,100*step/(mcmc_steps),HY_SL_TASK|HY_SL_PERCENT);
2975 TimerDifferenceFunction(false); // reset timer for the next second
2976 yieldCPUTime(); // let the GUI handle user actions
2977 if (terminate_execution) { // user wants to cancel the analysis
2978 break;
2979 }
2980 }
2981 #endif
2982 /* SLKP */
2983 }
2984 // end loop over steps
2985
2986
2987 for (long edge = 0; edge < num_nodes * num_nodes; edge++) {
2988 // convert sums of edge posterior probs. in container to means
2989 mcmc_chain->Store (edge, 1, (*mcmc_chain)(edge,1) / mcmc_samples);
2990 }
2991
2992
2993 // release cached scores (assuming that this is the last analysis to be done!)
2994 DumpComputeLists ();
2995
2996 /*SLKP 20070926; include progress report updates */
2997 #if !defined __UNIX__ || defined __HEADLESS__
2998 SetStatusLine (_HYBgm_STATUS_LINE_MCMC_DONE);
2999 #endif
3000 /* SLKP */
3001
3002 }
3003
3004 #endif
3005
3006
3007
3008 //___________________________________________________________________________________________
3009 #ifdef __AFYP_DEVELOPMENT2__
CacheNetworkParameters(void)3010 void Bgm::CacheNetworkParameters (void)
3011 {
3012 _SimpleList multipliers, // for indexing parent combinations (j)
3013 parents;
3014
3015 long num_parent_combos,
3016 r_i;
3017
3018 _Matrix n_ijk,
3019 theta_ijk;
3020
3021 network_parameters.Clear(); // reset cache
3022
3023
3024 for (long i = 0; i < num_nodes; i++) {
3025 // reset variables to defaults
3026 multipliers.Clear();
3027 multipliers << 1;
3028
3029 n_ijk.Clear();
3030 theta_ijk.Clear();
3031
3032 num_parent_combos = 1;
3033 r_i = num_levels.list_data[i];
3034
3035
3036 // assemble parents based on current graph
3037 for (long par = 0; par < num_nodes; par++) {
3038 if (dag(par, i) == 1 && is_discrete.list_data[par]) {
3039 parents << par; // list of parents will be in ascending numerical order
3040 num_parent_combos *= num_levels.list_data[par];
3041 multipliers << num_parent_combos;
3042 }
3043 }
3044
3045
3046 // re-allocate space in matrix
3047 CreateMatrix (&n_ijk, num_parent_combos, r_i+1, false, true, false);
3048 CreateMatrix (&theta_ijk, num_parent_combos, r_i, false, true, false);
3049
3050
3051 // tally observations
3052 for (long obs = 0; obs < obsData->GetHDim(); obs++) {
3053 long index = 0,
3054 child_state = (*obsData)(obs, i);
3055
3056 // is there a faster algorithm for this? - afyp
3057 for (long par = 0; par < parents.lLength; par++) {
3058 long this_parent = parents.list_data[par],
3059 this_parent_state = (*obsData)(obs, this_parent);
3060
3061 index += this_parent_state * multipliers.list_data[par];
3062 }
3063
3064 n_ijk.Store ((long) index, child_state, n_ijk(index, child_state) + 1);
3065 n_ijk.Store ((long) index, r_i, n_ijk(index, r_i) + 1); // store n_ij sum in last entry
3066 }
3067
3068
3069 // compute expected network conditional probabilities (Cooper and Herskovits 1992 eq.16)
3070 for (long j = 0; j < num_parent_combos; j++) {
3071 hyFloat denom = n_ijk(j,r_i+1) + r_i;
3072
3073 for (long k = 0; k < r_i; k++) {
3074 theta_ijk.Store (j, k, (n_ijk(j,k)+1.) / denom);
3075 }
3076 }
3077
3078
3079 // append parameters to member list
3080 network_parameters && (&theta_ijk);
3081 }
3082 }
3083 #endif
3084
3085
3086 //___________________________________________________________________________________________
3087 #ifdef __AFYP_DEVELOPMENT2__
SimulateDiscreteCases(long ncases)3088 _Matrix * Bgm::SimulateDiscreteCases (long ncases)
3089 {
3090 /* Identify nodes without parents in given graph.
3091 Draw random state assignments based on learned parameters (orphan score).
3092 Use graph matrix transpose to index child nodes.
3093 Visit children and randomly assign states dependent on parent assignment.
3094 Rinse and repeat.
3095 */
3096 CacheNetworkParameters();
3097
3098 _Matrix gad = dag.Transpose();
3099 _Matrix * cases = new _Matrix (num_nodes, ncases, false, true);
3100
3101 // proceed using postorder traversal (check for parents before resolving node)
3102
3103 }
3104 #endif
3105
3106
3107
3108 //___________________________________________________________________________________________
3109 #ifdef __AFYP_DEVELOPMENT2__
PostOrderInstantiate(long node_id,_SimpleList & results)3110 void Bgm::PostOrderInstantiate (long node_id, _SimpleList & results)
3111 {
3112 _SimpleList parents;
3113
3114 // assign parents first
3115 for (long par = 0; par < num_nodes; par++) {
3116 if (par != node_id && dag (par, node_id) == 1) {
3117 PostOrderInstantiate (par, results);
3118 parents << par;
3119 }
3120 }
3121
3122 // assign random state given parents
3123 for (long pindex = 0; pindex < parents.lLength; pindex++) {
3124 long par = parents.list_data[pindex];
3125
3126 }
3127 }
3128
3129
PredictNode(long node_id,_Matrix * assignments)3130 long Bgm::PredictNode (long node_id, _Matrix * assignments)
3131 {
3132
3133 }
3134 #endif
3135
3136
3137 //___________________________________________________________________________________________
3138 #if defined __HYPHYMPI__
SerializeBgm(_String & rec)3139 void Bgm::SerializeBgm (_String & rec)
3140 {
3141 char buf [255];
3142 _String * bgmName = (_String *) bgmNamesList (bgmList._SimpleList::Find((long)this));
3143 _String dataStr,
3144 dataName ("kBGMData"),
3145 banStr,
3146 banName ("ban_matrix");
3147
3148 hyFloat mcem_max_steps, mcem_burnin, mcem_sample_size;
3149
3150 checkParameter (mcemMaxSteps, mcem_max_steps, 10000);
3151 checkParameter (mcemBurnin, mcem_burnin, 1000);
3152 checkParameter (mcemSampleSize, mcem_sample_size, 100);
3153
3154 rec << "PRINT_DIGITS=-1;\n";
3155 rec << "USE_MPI_CACHING=1;\n";
3156
3157 // write utility functions
3158 rec << "function make_dnode (id,n,maxp)\n";
3159 rec << "{\ndnode={};\n";
3160 rec << "dnode[\"NodeID\"]=id;\n";
3161 rec << "dnode[\"PriorSize\"]=n;\n";
3162 rec << "dnode[\"MaxParents\"]=maxp;\n";
3163 rec << "return dnode;\n}\n";
3164
3165 rec << "function make_cnode (id,n,maxp,mean,prec)\n";
3166 rec << "{\ncnode={};\n";
3167 rec << "cnode[\"NodeID\"]=id;\n";
3168 rec << "cnode[\"PriorSize\"]=n;\n";
3169 rec << "cnode[\"MaxParents\"]=maxp;\n";
3170 rec << "cnode[\"PriorMean\"]=mean;\n";
3171 rec << "cnode[\"PriorVar\"]=prec;\n";
3172 rec << "return cnode;\n}\n";
3173
3174
3175 // prepare assortative lists
3176 rec << "dnodes={};\ncnodes={};\n";
3177
3178 for (long node_id = 0; node_id < num_nodes; node_id++) {
3179 if (is_discrete.list_data[node_id]) {
3180 rec << "dnodes[Abs(dnodes)]=make_dnode(";
3181 snprintf (buf, sizeof(buf), "%ld,%ld,%ld", node_id, (long)prior_sample_size(node_id,0), (long)max_parents.list_data[node_id]);
3182 rec << buf;
3183 rec << ");\n";
3184 } else {
3185 rec << "cnodes[Abs(cnodes)]=make_cnode(";
3186 snprintf (buf, sizeof(buf), "%ld,%ld,%ld,%f,%f", node_id, (long)prior_sample_size(node_id,0), (long)max_parents.list_data[node_id],
3187 prior_mean(node_id,0), prior_precision(node_id,0));
3188 rec << buf;
3189 rec << ");\n";
3190 }
3191 }
3192
3193 // write BGM constructor
3194 rec << "BGM ";
3195 rec << bgmName;
3196 rec << "=(dnodes,cnodes);\n";
3197
3198 // missing data imputation settings
3199 snprintf (buf, sizeof(buf), "BGM_MCEM_MAXSTEPS = %d;\n", (long)mcem_max_steps);
3200 rec << buf;
3201 snprintf (buf, sizeof(buf), "BGM_MCEM_BURNIN = %d;\n", (long)mcem_burnin);
3202 rec << buf;
3203 snprintf (buf, sizeof(buf), "BGM_MCEM_SAMPLES = %d;\n", (long)mcem_sample_size);
3204 rec << buf;
3205
3206 // ban matrix command has to come BEFORE setting data matrix (which calls CacheNodeScores)
3207 rec << "ban_matrix=";
3208 rec << (_String *)banned_edges.toStr();
3209 rec << ";\n";
3210 rec << "SetParameter(";
3211 rec << bgmName;
3212 rec << ",BGM_BAN_MATRIX,ban_matrix);\n";
3213
3214 // serialize data matrix and assign to BGM
3215 rec << "kBGMData=";
3216 rec << (_String *)obsData->toStr();
3217 rec << ";\n";
3218 rec << "SetParameter(";
3219 rec << bgmName;
3220 rec << ",BGM_DATA_MATRIX,kBGMData);\n";
3221
3222 }
3223 #endif
3224
3225 //___________________________________________________________________________________________
3226 //___________________________________________________________________________________________
3227 //___________________________________________________________________________________________
3228 //___________________________________________________________________________________________
3229 //___________________________________________________________________________________________
3230
3231
3232 #endif
3233