1
2 ///////////////////////////////////////////////////////////
3 // //
4 // SAGA //
5 // //
6 // System for Automated Geoscientific Analyses //
7 // //
8 // Application Programming Interface //
9 // //
10 // Library: SAGA_API //
11 // //
12 //-------------------------------------------------------//
13 // //
14 // mRMR.cpp //
15 // //
16 // Copyright (C) 2014 by //
17 // Olaf Conrad //
18 // //
19 //-------------------------------------------------------//
20 // //
21 // This file is part of 'SAGA - System for Automated //
22 // Geoscientific Analyses'. //
23 // //
24 // This library is free software; you can redistribute //
25 // it and/or modify it under the terms of the GNU Lesser //
26 // General Public License as published by the Free //
27 // Software Foundation, either version 2.1 of the //
28 // License, or (at your option) any later version. //
29 // //
30 // This library is distributed in the hope that it will //
31 // be useful, but WITHOUT ANY WARRANTY; without even the //
32 // implied warranty of MERCHANTABILITY or FITNESS FOR A //
33 // PARTICULAR PURPOSE. See the GNU Lesser General Public //
34 // License for more details. //
35 // //
36 // You should have received a copy of the GNU Lesser //
37 // General Public License along with this program; if //
38 // not, see <http://www.gnu.org/licenses/>. //
39 // //
40 //-------------------------------------------------------//
41 // //
42 // e-mail: oconrad@saga-gis.org //
43 // //
44 // contact: Olaf Conrad //
45 // Institute of Geography //
46 // University of Hamburg //
47 // Germany //
48 // //
49 ///////////////////////////////////////////////////////////
50
51 //---------------------------------------------------------
52 #ifdef WITH_MRMR
53
54 //---------------------------------------------------------
55 // A C++ program to implement the mRMR selection using mutual information
56 // written by Hanchuan Peng.
57 //
58 // Disclaimer: The author of program is Hanchuan Peng
59 // at <penghanchuan@yahoo.com>.
60 //
61 // The CopyRight is reserved by the author.
62 //
63 // Last modification: Jan/25/2007
64 //
65 // by Hanchuan Peng
66 // 2005-10-24: finalize the computing parts of the whole program
67 // 2005-10-25: add non-discretization for the classification variable. Also slightly change some output info for the web application
68 // 2005-11-01: add control to the user-defined max variable number and sample number
// 2006-01-26: add gnu_getline.c to convert the code to be compilable under Mac OS.
70 // 2007-01-25: change the address info
71
72 //---------------------------------------------------------
#include <new>

#include "mat_tools.h"
#include "parameters.h"
#include "table.h"
76
77
78 ///////////////////////////////////////////////////////////
79 // //
80 // //
81 // //
82 ///////////////////////////////////////////////////////////
83
84 //---------------------------------------------------------
// free a new[]-allocated array and reset the pointer (no-op for a NULL pointer)
#define DELETE_ARRAY(p) if( p ) { delete[]p; p = NULL; }
86
87 //---------------------------------------------------------
// field indexes of the m_pSelection table; the order must match the
// Add_Field() calls in the CSG_mRMR constructor
enum ESG_mRMR_Selection
{
	SG_mRMR_SELECTION_RANK	= 0,	// one-based rank of the selected feature
	SG_mRMR_SELECTION_INDEX,		// variable (column) index of the feature
	SG_mRMR_SELECTION_NAME,			// feature name
	SG_mRMR_SELECTION_SCORE			// mRMR selection score
};
95
96
97 ///////////////////////////////////////////////////////////
98 // //
99 ///////////////////////////////////////////////////////////
100
101 //---------------------------------------------------------
CSG_mRMR(void)102 CSG_mRMR::CSG_mRMR(void)
103 {
104 m_Samples = NULL;
105 m_nSamples = 0;
106 m_nVars = 0;
107 m_bDiscretized = false; // initialize the data as continous
108 m_bVerbose = false;
109
110 m_pSelection = new CSG_Table;
111
112 m_pSelection->Add_Field("RANK" , SG_DATATYPE_Int ); // mRMR_SELFLD_RANK
113 m_pSelection->Add_Field("INDEX", SG_DATATYPE_Int ); // mRMR_SELFLD_INDEX
114 m_pSelection->Add_Field("NAME" , SG_DATATYPE_String); // mRMR_SELFLD_NAME
115 m_pSelection->Add_Field("SCORE", SG_DATATYPE_Double); // mRMR_SELFLD_SCORE
116 }
117
118 //---------------------------------------------------------
~CSG_mRMR(void)119 CSG_mRMR::~CSG_mRMR(void)
120 {
121 Destroy();
122
123 delete(m_pSelection);
124 }
125
126 //---------------------------------------------------------
Destroy(void)127 void CSG_mRMR::Destroy(void)
128 {
129 if( m_Samples )
130 {
131 DELETE_ARRAY(m_Samples[0]);
132 DELETE_ARRAY(m_Samples);
133 }
134
135 m_VarNames .Clear();
136 m_nSamples = 0;
137 m_nVars = 0;
138 m_bDiscretized = false; // initialize the data as continous
139
140 m_pSelection ->Del_Records();
141 }
142
143
144 ///////////////////////////////////////////////////////////
145 // //
146 ///////////////////////////////////////////////////////////
147
148 //---------------------------------------------------------
CSG_String CSG_mRMR::Get_Description(void)
{
	// Returns a short description of the mRMR algorithm with references.
	// The text is kept as one single literal, because the _TW() translation
	// macro uses the complete string as its lookup key.
	return( _TW(
		"The minimum Redundancy Maximum Relevance (mRMR) feature selection "
		"algorithm has been developed by Hanchuan Peng <hanchuan.peng@gmail.com>.\n"
		"\n"
		"References:\n"
		"Feature selection based on mutual information: criteria of max-dependency, max-relevance, and min-redundancy. "
		"Hanchuan Peng, Fuhui Long, and Chris Ding, "
		"IEEE Transactions on Pattern Analysis and Machine Intelligence, "
		"Vol. 27, No. 8, pp.1226-1238, 2005.\n"
		"\n"
		"Minimum redundancy feature selection from microarray gene expression data,\n"
		"Chris Ding, and Hanchuan Peng, "
		"Journal of Bioinformatics and Computational Biology, "
		"Vol. 3, No. 2, pp.185-205, 2005.\n"
		"\n"
		"Hanchuan Peng's mRMR Homepage at "
		"<a target=\"_blank\" href=\"http://penglab.janelia.org/proj/mRMR/\">"
		"http://penglab.janelia.org/proj/mRMR/</a>\n"
	));
}
171
172
173 ///////////////////////////////////////////////////////////
174 // //
175 ///////////////////////////////////////////////////////////
176
177 //---------------------------------------------------------
Parameters_Add(CSG_Parameters * pParameters,CSG_Parameter * pNode)178 bool CSG_mRMR::Parameters_Add(CSG_Parameters *pParameters, CSG_Parameter *pNode)
179 {
180 CSG_String ParentID(pNode ? pNode->Get_Identifier() : SG_T(""));
181
182 pParameters->Add_Int(
183 ParentID, "mRMR_NFEATURES" , _TL("Number of Features"),
184 _TL(""),
185 50, 1, true
186 );
187
188 pParameters->Add_Bool(
189 ParentID, "mRMR_DISCRETIZE" , _TL("Discretization"),
190 _TL("uncheck this means no discretizaton (i.e. data is already integer)"),
191 true
192 );
193
194 pParameters->Add_Double(
195 ParentID, "mRMR_THRESHOLD" , _TL("Discretization Threshold"),
196 _TL("a double number of the discretization threshold; set to 0 to make binarization"),
197 1.0, 0.0, true
198 );
199
200 pParameters->Add_Choice(
201 ParentID, "mRMR_METHOD" , _TL("Selection Method"),
202 _TL(""),
203 CSG_String::Format("%s|%s|",
204 _TL("Mutual Information Difference (MID)"),
205 _TL("Mutual Information Quotient (MIQ)")
206 ), 0
207 );
208
209 return( true );
210 }
211
212 //---------------------------------------------------------
Parameters_Enable(CSG_Parameters * pParameters,CSG_Parameter * pParameter)213 int CSG_mRMR::Parameters_Enable(CSG_Parameters *pParameters, CSG_Parameter *pParameter)
214 {
215 if( pParameter->Cmp_Identifier("mRMR_DISCRETIZE") )
216 {
217 pParameters->Set_Enabled("mRMR_THRESHOLD", pParameter->asBool());
218 }
219
220 return( 1 );
221 }
222
223 //---------------------------------------------------------
Set_Data(CSG_Table & Data,int ClassField,CSG_Parameters * pParameters)224 bool CSG_mRMR::Set_Data(CSG_Table &Data, int ClassField, CSG_Parameters *pParameters)
225 {
226 bool bDiscretize = (*pParameters)("mRMR_DISCRETIZE") ? (*pParameters)("mRMR_DISCRETIZE")->asBool () : true;
227 double Threshold = (*pParameters)("mRMR_THRESHOLD" ) ? (*pParameters)("mRMR_THRESHOLD" )->asDouble() : 1.0;
228
229 return( Set_Data(Data, ClassField, bDiscretize ? Threshold : -1.0) );
230 }
231
Set_Data(CSG_Matrix & Data,int ClassField,CSG_Parameters * pParameters)232 bool CSG_mRMR::Set_Data(CSG_Matrix &Data, int ClassField, CSG_Parameters *pParameters)
233 {
234 bool bDiscretize = (*pParameters)("mRMR_DISCRETIZE") ? (*pParameters)("mRMR_DISCRETIZE")->asBool () : true;
235 double Threshold = (*pParameters)("mRMR_THRESHOLD" ) ? (*pParameters)("mRMR_THRESHOLD" )->asDouble() : 1.0;
236
237 return( Set_Data(Data, ClassField, bDiscretize ? Threshold : -1.0) );
238 }
239
240 //---------------------------------------------------------
Get_Selection(CSG_Parameters * pParameters)241 bool CSG_mRMR::Get_Selection(CSG_Parameters *pParameters)
242 {
243 int nFeatures = (*pParameters)("mRMR_NFEATURES") ? (*pParameters)("mRMR_NFEATURES")->asInt() : 50;
244 int Method = (*pParameters)("mRMR_METHOD" ) ? (*pParameters)("mRMR_METHOD" )->asInt() : 0;
245
246 return( Get_Selection(nFeatures, Method) );
247 }
248
249
250 ///////////////////////////////////////////////////////////
251 // //
252 ///////////////////////////////////////////////////////////
253
254 //---------------------------------------------------------
Get_Count(void) const255 int CSG_mRMR::Get_Count (void) const
256 {
257 return( m_pSelection->Get_Count() );
258 }
259
Get_Index(int i) const260 int CSG_mRMR::Get_Index(int i) const
261 {
262 return( m_pSelection->Get_Record(i)->asInt (SG_mRMR_SELECTION_INDEX) );
263 }
264
Get_Name(int i) const265 CSG_String CSG_mRMR::Get_Name(int i) const
266 {
267 return( m_pSelection->Get_Record(i)->asString(SG_mRMR_SELECTION_NAME ) );
268 }
269
Get_Score(int i) const270 double CSG_mRMR::Get_Score(int i) const
271 {
272 return( m_pSelection->Get_Record(i)->asDouble(SG_mRMR_SELECTION_SCORE) );
273 }
274
275
276 ///////////////////////////////////////////////////////////
277 // //
278 ///////////////////////////////////////////////////////////
279
280 //---------------------------------------------------------
// verbose-mode logging helper: appends one line to the execution message output
#define ADD_MESSAGE(Message) if( m_bVerbose ) SG_UI_Msg_Add_Execution(CSG_String(Message) + "\n", false);
// like ADD_MESSAGE(), but prefixed and styled as a warning
#define ADD_WARNING(Message) if( m_bVerbose ) SG_UI_Msg_Add_Execution(_TL("Warning") + CSG_String(": ") + CSG_String(Message) + "\n", false, SG_UI_MSG_STYLE_FAILURE);
283
284
285 ///////////////////////////////////////////////////////////
286 // //
287 ///////////////////////////////////////////////////////////
288
289 //---------------------------------------------------------
Get_Memory(int nVars,int nSamples)290 bool CSG_mRMR::Get_Memory(int nVars, int nSamples)
291 {
292 Destroy();
293
294 //-----------------------------------------------------
295 m_nVars = nVars;
296
297 if( m_nVars <= 0 )
298 {
299 SG_UI_Msg_Add_Error("no features");
300
301 return( false );
302 }
303
304 m_nSamples = nSamples;
305
306 if( m_nSamples <= 0 )
307 {
308 SG_UI_Msg_Add_Error("no samples");
309
310 return( false );
311 }
312
313 //-----------------------------------------------------
314 m_Samples = new double *[m_nSamples];
315
316 if( !m_Samples )
317 {
318 SG_UI_Msg_Add_Error("failed to allocate memory.");
319
320 return( false );
321 }
322
323 m_Samples[0] = new double[m_nSamples * m_nVars];
324
325 if( !m_Samples[0] )
326 {
327 SG_UI_Msg_Add_Error("failed to allocate memory.");
328
329 return( false );
330 }
331
332 return( true );
333 }
334
335 //---------------------------------------------------------
bool CSG_mRMR::Set_Data(CSG_Table &Data, int ClassField, double Threshold)
{
	// Copies the table into the internal sample matrix. The class field is
	// moved to the first column and encoded as an ordinal number: samples
	// are sorted by the class field, and the code increases whenever the
	// class string changes. A non-negative Threshold triggers the
	// discretization of the feature columns. Returns false if no memory
	// could be allocated.

	if( !Get_Memory(Data.Get_Field_Count(), Data.Get_Count()) )
	{
		return( false );
	}

	//-----------------------------------------------------
	if( ClassField < 0 || ClassField >= m_nVars )
	{
		ClassField = 0;	// invalid class field: fall back to the first one
	}

	// sort by class field, so that identical class values are adjacent
	Data.Set_Index(ClassField, TABLE_INDEX_Ascending);

	CSG_String s;

	for(int iSample=0, Class=0; iSample<m_nSamples; iSample++)
	{
		// each row pointer addresses its slice of the contiguous data block
		double *pData = m_Samples[iSample] = m_Samples[0] + iSample * m_nVars;

		if( s.Cmp(Data[iSample].asString(ClassField)) )	// new class value?
		{
			s = Data[iSample].asString(ClassField);

			Class++;
		}

		*pData++ = Class;	// first column holds the ordinal class code

		for(int iVar=0; iVar<m_nVars; iVar++)
		{
			if( iVar != ClassField )	// remaining columns: the features
			{
				*pData++ = Data[iSample].asDouble(iVar);
			}
		}
	}

	Data.Del_Index();

	// variable names follow the column order: class first, then the features
	m_VarNames += Data.Get_Field_Name(ClassField);

	for(int iVar=0; iVar<m_nVars; iVar++)
	{
		if( iVar != ClassField )
		{
			m_VarNames += Data.Get_Field_Name(iVar);
		}
	}

	//-----------------------------------------------------
	if( Threshold >= 0.0 ) // discretization requested
	{
		Discretize(Threshold);
	}

	return( true );
}
395
396 //---------------------------------------------------------
bool CSG_mRMR::Set_Data(CSG_Matrix &Data, int ClassField, double Threshold)
{
	// Copies the matrix into the internal sample matrix. The class column is
	// moved to the first position and its values are taken as given (no
	// ordinal re-encoding as in the table version); variable names are
	// generated generically ("CLASS", "FEATURE_xx"). A non-negative
	// Threshold triggers the discretization of the feature columns.

	if( !Get_Memory(Data.Get_NCols(), Data.Get_NRows()) )
	{
		return( false );
	}

	//-----------------------------------------------------
	if( ClassField < 0 || ClassField >= m_nVars )
	{
		ClassField = 0;	// invalid class column: fall back to the first one
	}

	for(int iSample=0; iSample<m_nSamples; iSample++)
	{
		// each row pointer addresses its slice of the contiguous data block
		double *pData = m_Samples[iSample] = m_Samples[0] + iSample * m_nVars;

		*pData++ = Data[iSample][ClassField];	// first column: class value

		for(int iVar=0; iVar<m_nVars; iVar++)
		{
			if( iVar != ClassField )	// remaining columns: the features
			{
				*pData++ = Data[iSample][iVar];
			}
		}
	}

	// generic variable names, matching the column order established above
	m_VarNames += "CLASS";

	for(int iVar=0; iVar<m_nVars; iVar++)
	{
		if( iVar != ClassField )
		{
			m_VarNames += CSG_String::Format(SG_T("FEATURE_%02d"), iVar);
		}
	}

	//-----------------------------------------------------
	if( Threshold >= 0.0 ) // discretization requested
	{
		Discretize(Threshold);
	}

	return( true );
}
443
444
445 ///////////////////////////////////////////////////////////
446 // //
447 ///////////////////////////////////////////////////////////
448
449 //---------------------------------------------------------
Discretize(double Threshold)450 bool CSG_mRMR::Discretize(double Threshold)
451 {
452 if( !m_Samples[0] || Threshold < 0.0 || m_bDiscretized )
453 {
454 return( false );
455 }
456
457 long i, j; // exclude the first column
458
459 //-----------------------------------------------------
460 for(j=1; j<m_nVars; j++) // z-score
461 {
462 double cursum = 0;
463 double curmean = 0;
464 double curstd = 0;
465
466 for(i=0; i<m_nSamples; i++)
467 {
468 cursum += m_Samples[i][j];
469 }
470
471 curmean = cursum / m_nSamples;
472 cursum = 0;
473
474 register double tmpf;
475
476 for(i=0; i<m_nSamples; i++)
477 {
478 tmpf = m_Samples[i][j] - curmean;
479 cursum += tmpf * tmpf;
480 }
481
482 curstd = (m_nSamples == 1) ? 0 : sqrt(cursum / (m_nSamples - 1)); // m_nSamples -1 is an unbiased version for Gaussian
483
484 for(i=0; i<m_nSamples; i++)
485 {
486 m_Samples[i][j] = (m_Samples[i][j] - curmean) / curstd;
487 }
488 }
489
490 //-----------------------------------------------------
491 for(j=1; j<m_nVars; j++)
492 {
493 register double tmpf;
494
495 for(i=0; i<m_nSamples; i++)
496 {
497 tmpf = m_Samples[i][j];
498
499 if( tmpf > Threshold )
500 {
501 tmpf = 1;
502 }
503 else if( tmpf < -Threshold )
504 {
505 tmpf = -1;
506 }
507 else
508 {
509 tmpf = 0;
510 }
511
512 m_Samples[i][j] = tmpf;
513 }
514 }
515
516 m_bDiscretized = true;
517
518 return( true );
519 }
520
521
522 ///////////////////////////////////////////////////////////
523 // //
524 ///////////////////////////////////////////////////////////
525
526 //---------------------------------------------------------
// Appends the feature stored in feaInd[rank] together with its score to the
// selection table and, in verbose mode, logs it. Expects a local variable
// 'feaInd' (long *) to be in scope at the expansion site (see Get_Selection).
#define ADD_FEATURE(rank, score) {\
	CSG_Table_Record *pFeature = m_pSelection->Add_Record();\
\
	pFeature->Set_Value(SG_mRMR_SELECTION_RANK , rank + 1);\
	pFeature->Set_Value(SG_mRMR_SELECTION_INDEX, feaInd[rank]);\
	pFeature->Set_Value(SG_mRMR_SELECTION_NAME , m_VarNames[(size_t)feaInd[rank]]);\
	pFeature->Set_Value(SG_mRMR_SELECTION_SCORE, score);\
\
	ADD_MESSAGE(CSG_String::Format(SG_T("%d \t %d \t %s \t %5.3f"),\
		rank + 1, feaInd[rank], m_VarNames[(size_t)feaInd[rank]].c_str(), score)\
	);\
}
539
540 //---------------------------------------------------------
Pool_Compare(const void * a,const void * b)541 int CSG_mRMR::Pool_Compare(const void *a, const void *b)
542 {
543 if( ((TPool *)a)->mival < ((TPool *)b)->mival )
544 return( -1 );
545
546 if( ((TPool *)a)->mival > ((TPool *)b)->mival )
547 return( 1 );
548
549 return( 0 );
550 }
551
552 //---------------------------------------------------------
Get_Selection(int nFeatures,int Method)553 bool CSG_mRMR::Get_Selection(int nFeatures, int Method)
554 {
555 m_pSelection->Del_Records();
556
557 if( !m_Samples[0] )
558 {
559 SG_UI_Msg_Add_Error("The input data is NULL.");
560
561 return( false );
562 }
563
564 if( nFeatures < 0 )
565 {
566 SG_UI_Msg_Add_Error("The input number of features is negative.");
567
568 return( false );
569 }
570
571 long poolUseFeaLen = 500;
572 if( poolUseFeaLen > m_nVars - 1 ) // there is a target variable (the first one), that is why must remove one
573 poolUseFeaLen = m_nVars - 1;
574
575 if( nFeatures > poolUseFeaLen )
576 nFeatures = poolUseFeaLen;
577
578 long *feaInd = new long[nFeatures];
579
580 if( !feaInd )
581 {
582 SG_UI_Msg_Add_Error("Fail to allocate memory.");
583
584 return( false );
585 }
586
587 //-----------------------------------------------------
588 // determine the pool
589
590 TPool *Pool = (TPool *)SG_Malloc(m_nVars * sizeof(TPool));
591
592 if( !Pool )
593 {
594 SG_UI_Msg_Add_Error("Fail to allocate memory.");
595
596 return( false );
597 }
598
599 //-----------------------------------------------------
600 long i, j, k;
601
602 for(i=0; i<m_nVars; i++) // the Pool[0].mival is the entropy of target classification variable
603 {
604 Pool[i].mival = -Get_MutualInfo(0, i); // set as negative for sorting purpose
605 Pool[i].Index = i;
606 Pool[i].Mask = 1; // initialized to be everything in pool would be considered
607 }
608
609 qsort(Pool + 1, m_nVars - 1, sizeof(TPool), Pool_Compare);
610
611 Pool[0].mival = -Pool[0].mival;
612
613 ADD_MESSAGE(CSG_String::Format(
614 SG_T("\nTarget classification variable (#%d column in the input data) has name=%s \tentropy score=%5.3f"),
615 0 + 1, m_VarNames[0].c_str(), Pool[0].mival
616 ));
617
618 ADD_MESSAGE("\n*** MaxRel features ***");
619 ADD_MESSAGE("Order\tFea\tName\tScore");
620
621 for(i=1; i<m_nVars-1; i++)
622 {
623 Pool[i].mival = -Pool[i].mival;
624
625 if( i <= nFeatures )
626 {
627 ADD_MESSAGE(CSG_String::Format(SG_T("%d \t %d \t %s \t %5.3f"),
628 i, (int)Pool[i].Index, m_VarNames[(int)Pool[i].Index].c_str(), Pool[i].mival
629 ));
630 }
631 }
632
633 //-----------------------------------------------------
634 // mRMR selection
635
636 long poolFeaIndMin = 1;
637 long poolFeaIndMax = poolFeaIndMin + poolUseFeaLen - 1;
638
639 feaInd[0] = Pool[1].Index;
640 Pool[feaInd[0]].Mask = 0; // after selection, no longer consider this feature
641 Pool[0 ].Mask = 0; // of course the first one, which corresponds to the classification variable, should not be considered. Just set the mask as 0 for safety.
642
643 ADD_MESSAGE("\n*** mRMR features ***");
644 ADD_MESSAGE("Order\tFea\tName\tScore");
645
646 ADD_FEATURE(0, Pool[1].mival);
647
648 for(k=1; k<nFeatures; k++) //the first one, feaInd[0] has been determined already
649 {
650 double relevanceVal, redundancyVal, tmpscore, selectscore;
651 long selectind;
652 int b_firstSelected = 0;
653
654 for(i=poolFeaIndMin; i<=poolFeaIndMax; i++)
655 {
656 if( Pool[Pool[i].Index].Mask == 0 )
657 {
658 continue; // skip this feature as it was selected already
659 }
660
661 relevanceVal = Get_MutualInfo(0, Pool[i].Index); // actually no necessary to re-compute it, this value can be retrieved from Pool[].mival vector
662 redundancyVal = 0;
663
664 for(j=0; j<k; j++)
665 {
666 redundancyVal += Get_MutualInfo(feaInd[j], Pool[i].Index);
667 }
668
669 redundancyVal /= k;
670
671 switch( Method )
672 {
673 default: // fprintf(stderr, "Undefined selection method=%d. Use MID instead.\n", mymethod);
674 case SG_mRMR_Method_MID: tmpscore = relevanceVal - redundancyVal; break;
675 case SG_mRMR_Method_MIQ: tmpscore = relevanceVal / (redundancyVal + 0.0001); break;
676 }
677
678 if( b_firstSelected == 0 )
679 {
680 selectscore = tmpscore;
681 selectind = Pool[i].Index;
682 b_firstSelected = 1;
683 }
684 else if( tmpscore > selectscore )
685 { //update the best feature found and the score
686 selectscore = tmpscore;
687 selectind = Pool[i].Index;
688 }
689 }
690
691 feaInd[k] = selectind;
692 Pool[selectind].Mask = 0;
693
694 ADD_FEATURE(k, selectscore);
695 }
696
697 //-----------------------------------------------------
698 return( true );
699 }
700
701
702 ///////////////////////////////////////////////////////////
703 // //
704 ///////////////////////////////////////////////////////////
705
706 //---------------------------------------------------------
Get_MutualInfo(long v1,long v2)707 double CSG_mRMR::Get_MutualInfo(long v1, long v2)
708 {
709 double mi = -1; // initialized as an illegal value
710
711 if( !m_Samples[0] )
712 {
713 SG_UI_Msg_Add_Error("The input data is NULL.");
714
715 return mi;
716 }
717
718 if( v1 >= m_nVars || v2 >= m_nVars || v1 < 0 || v2 < 0 )
719 {
720 SG_UI_Msg_Add_Error("The input variable indexes are invalid (out of range).");
721
722 return mi;
723 }
724
725 //-----------------------------------------------------
726 // copy data
727
728 int *v1data = new int[m_nSamples];
729 int *v2data = new int[m_nSamples];
730
731 if( !v1data || !v2data )
732 {
733 SG_UI_Msg_Add_Error("Fail to allocate memory.");
734
735 return mi;
736 }
737
738 for(long i=0; i<m_nSamples; i++)
739 {
740 v1data[i] = (int)m_Samples[i][v1]; // the double already been discretized, should be safe now
741 v2data[i] = (int)m_Samples[i][v2];
742 }
743
744 //-----------------------------------------------------
745 // compute mutual info
746
747 long nstate = 3; // always true for DataTable, which was discretized as three states
748
749 int nstate1 = 0, nstate2 = 0;
750
751 double *pab = Get_JointProb(v1data, v2data, m_nSamples, nstate, nstate1, nstate2);
752
753 mi = Get_MutualInfo(pab, nstate1, nstate2);
754
755 //-----------------------------------------------------
756 // free memory and return
757
758 DELETE_ARRAY(v1data);
759 DELETE_ARRAY(v2data);
760 DELETE_ARRAY(pab );
761
762 return mi;
763 }
764
765 //---------------------------------------------------------
Get_MutualInfo(double * pab,long pabhei,long pabwid)766 double CSG_mRMR::Get_MutualInfo(double *pab, long pabhei, long pabwid)
767 {
768 //-----------------------------------------------------
769 // check if parameters are correct
770
771 if( !pab )
772 {
773 SG_UI_Msg_Add_Error("Got illeagal parameter in compute_mutualinfo().");
774
775 return( -1.0 );
776 }
777
778 //-----------------------------------------------------
779 long i, j;
780
781 double **pab2d = new double *[pabwid];
782
783 for(j=0; j<pabwid; j++)
784 {
785 pab2d[j] = pab + (long)j * pabhei;
786 }
787
788 double *p1 = 0, *p2 = 0;
789 long p1len = 0, p2len = 0;
790 int b_findmarginalprob = 1;
791
792 //-----------------------------------------------------
793 //generate marginal probability arrays
794
795 if( b_findmarginalprob != 0 )
796 {
797 p1len = pabhei;
798 p2len = pabwid;
799 p1 = new double[p1len];
800 p2 = new double[p2len];
801
802 for (i = 0; i < p1len; i++) p1[i] = 0;
803 for (j = 0; j < p2len; j++) p2[j] = 0;
804
805 for (i = 0; i < p1len; i++) //p1len = pabhei
806 {
807 for (j = 0; j < p2len; j++) //p2len = panwid
808 {
809 // printf("%f ",pab2d[j][i]);
810 p1[i] += pab2d[j][i];
811 p2[j] += pab2d[j][i];
812 }
813 }
814 }
815
816 //-----------------------------------------------------
817 // calculate the mutual information
818
819 double muInf = 0;
820
821 muInf = 0.0;
822
823 for (j = 0; j < pabwid; j++) // should for p2
824 {
825 for (i = 0; i < pabhei; i++) // should for p1
826 {
827 if (pab2d[j][i] != 0 && p1[i] != 0 && p2[j] != 0)
828 {
829 muInf += pab2d[j][i] * log (pab2d[j][i] / p1[i] / p2[j]);
830 // printf("%f %fab %fa %fb\n",muInf,pab2d[j][i]/p1[i]/p2[j],p1[i],p2[j]);
831 }
832 }
833 }
834
835 muInf /= log(2.0);
836
837 //-----------------------------------------------------
838 // free memory
839
840 DELETE_ARRAY(pab2d);
841
842 if( b_findmarginalprob != 0 )
843 {
844 DELETE_ARRAY(p1);
845 DELETE_ARRAY(p2);
846 }
847
848 return muInf;
849 }
850
851
852 ///////////////////////////////////////////////////////////
853 // //
854 ///////////////////////////////////////////////////////////
855
856 //---------------------------------------------------------
template <class T> double * CSG_mRMR::Get_JointProb(T * img1, T * img2, long len, long maxstatenum, int &nstate1, int &nstate2)
{
	// Builds the joint probability table of the two input vectors. Values
	// are rounded to integers and shifted to start at state 0 (see
	// Copy_Vector()); the returned array holds nstate1 * nstate2
	// probabilities in column-major order. The caller takes ownership and
	// must delete[] the result. nstate1/nstate2 receive the number of
	// states actually found; maxstatenum is not used for sizing here.
	// Returns NULL on error.

	long i, j;

	//-----------------------------------------------------
	// get and check size information

	if (!img1 || !img2 || len < 0)
	{
		SG_UI_Msg_Add_Error("At least one of the input vectors is invalid.");

		return( NULL );
	}

	// int b_findstatenum = 1; // int nstate1 = 0, nstate2 = 0;
	int b_returnprob = 1;	// return probabilities rather than raw counts

	//-----------------------------------------------------
	// copy data into new INT type arrays (hence quantization) and then rearrange them to begin from 0 (i.e. state1)

	int *vec1 = new int[len];
	int *vec2 = new int[len];

	if( !vec1 || !vec2 )
	{
		// NOTE(review): if only one of the two allocations failed, the other
		// array would leak here (moot in practice, since operator new throws)
		SG_UI_Msg_Add_Error("Fail to allocate memory.\n");

		return( NULL );
	}

	int nrealstate1 = 0, nrealstate2 = 0;

	Copy_Vector(img1, len, vec1, nrealstate1);	// also yields each vector's state count
	Copy_Vector(img2, len, vec2, nrealstate2);

	// update the #state when necessary
	nstate1 = (nstate1 < nrealstate1) ? nrealstate1 : nstate1;
	nstate2 = (nstate2 < nrealstate2) ? nrealstate2 : nstate2;

	//-----------------------------------------------------
	// generate the joint-distribution table

	double *hab = new double[nstate1 * nstate2];	// contiguous table, returned to the caller
	double **hab2d = new double *[nstate2];			// per-column access pointers, freed below

	if( !hab || !hab2d )
	{
		// NOTE(review): vec1/vec2 (and possibly hab) would leak here as well
		SG_UI_Msg_Add_Error("Fail to allocate memory.");

		return( NULL );
	}

	for(j=0; j<nstate2; j++)
	{
		hab2d[j] = hab + (long)j * nstate1;
	}

	for(i=0; i<nstate1; i++)	// zero the table before counting
	{
		for(j=0; j<nstate2; j++)
		{
			hab2d[j][i] = 0;
		}
	}

	for(i=0; i<len; i++)
	{
		// count the co-occurrence of the state pair (vec1[i], vec2[i])
		hab2d[vec2[i]][vec1[i]] += 1;
	}

	//-----------------------------------------------------
	// return the probabilities, otherwise return count numbers

	if( b_returnprob )
	{
		for(i=0; i<nstate1; i++)	// normalize counts to probabilities
		{
			for(j=0; j<nstate2; j++)
			{
				hab2d[j][i] /= len;
			}
		}
	}

	//-----------------------------------------------------
	// finish

	DELETE_ARRAY(hab2d);
	DELETE_ARRAY(vec1);
	DELETE_ARRAY(vec2);

	return hab;
}
961
962 //---------------------------------------------------------
Copy_Vector(T * srcdata,long len,int * desdata,int & nstate)963 template <class T> void CSG_mRMR::Copy_Vector(T *srcdata, long len, int *desdata, int &nstate)
964 {
965 if (!srcdata || !desdata)
966 {
967 SG_UI_Msg_Add_Error("no points in Copy_Vector()!");
968
969 return;
970 }
971
972 // note: originally I added 0.5 before rounding, however seems the negative numbers and
973 // positive numbers are all rounded towarded 0; hence int(-1+0.5)=0 and int(1+0.5)=1;
974 // This is unwanted because I need the above to be -1 and 1.
975 // for this reason I just round with 0.5 adjustment for positive and negative differently
976
977 // copy data
978 int minn, maxx;
979 if (srcdata[0] > 0)
980 {
981 maxx = minn = int (srcdata[0] + 0.5);
982 }
983 else
984 {
985 maxx = minn = int (srcdata[0] - 0.5);
986 }
987
988 long i;
989 int tmp;
990 double tmp1;
991
992 for (i = 0; i < len; i++)
993 {
994 tmp1 = double (srcdata[i]);
995 tmp = (tmp1 > 0) ? (int) (tmp1 + 0.5) : (int) (tmp1 - 0.5); //round to integers
996 minn = (minn < tmp) ? minn : tmp;
997 maxx = (maxx > tmp) ? maxx : tmp;
998 desdata[i] = tmp;
999 // printf("%i ",desdata[i]);
1000 }
1001
1002 // make the vector data begin from 0 (i.e. 1st state)
1003 for (i = 0; i < len; i++)
1004 {
1005 desdata[i] -= minn;
1006 }
1007
1008 // return the #state
1009 nstate = (maxx - minn + 1);
1010
1011 return;
1012 }
1013
1014
1015 ///////////////////////////////////////////////////////////
1016 // //
1017 // //
1018 // //
1019 ///////////////////////////////////////////////////////////
1020
1021 //---------------------------------------------------------
1022 #endif // WITH_MRMR
1023