1 
2 ///////////////////////////////////////////////////////////
3 //                                                       //
4 //                         SAGA                          //
5 //                                                       //
6 //      System for Automated Geoscientific Analyses      //
7 //                                                       //
8 //           Application Programming Interface           //
9 //                                                       //
10 //                  Library: SAGA_API                    //
11 //                                                       //
12 //-------------------------------------------------------//
13 //                                                       //
14 //                       mRMR.cpp                        //
15 //                                                       //
16 //                 Copyright (C) 2014 by                 //
17 //                      Olaf Conrad                      //
18 //                                                       //
19 //-------------------------------------------------------//
20 //                                                       //
21 // This file is part of 'SAGA - System for Automated     //
22 // Geoscientific Analyses'.                              //
23 //                                                       //
24 // This library is free software; you can redistribute   //
25 // it and/or modify it under the terms of the GNU Lesser //
26 // General Public License as published by the Free       //
27 // Software Foundation, either version 2.1 of the        //
28 // License, or (at your option) any later version.       //
29 //                                                       //
30 // This library is distributed in the hope that it will  //
31 // be useful, but WITHOUT ANY WARRANTY; without even the //
32 // implied warranty of MERCHANTABILITY or FITNESS FOR A  //
33 // PARTICULAR PURPOSE. See the GNU Lesser General Public //
34 // License for more details.                             //
35 //                                                       //
36 // You should have received a copy of the GNU Lesser     //
37 // General Public License along with this program; if    //
38 // not, see <http://www.gnu.org/licenses/>.              //
39 //                                                       //
40 //-------------------------------------------------------//
41 //                                                       //
42 //    e-mail:     oconrad@saga-gis.org                   //
43 //                                                       //
44 //    contact:    Olaf Conrad                            //
45 //                Institute of Geography                 //
46 //                University of Hamburg                  //
47 //                Germany                                //
48 //                                                       //
49 ///////////////////////////////////////////////////////////
50 
51 //---------------------------------------------------------
52 #ifdef WITH_MRMR
53 
54 //---------------------------------------------------------
55 // A C++ program to implement the mRMR selection using mutual information
56 // written by Hanchuan Peng.
57 //
58 // Disclaimer: The author of program is Hanchuan Peng
59 //      at <penghanchuan@yahoo.com>.
60 //
61 // The CopyRight is reserved by the author.
62 //
63 // Last modification: Jan/25/2007
64 //
65 // by Hanchuan Peng
66 // 2005-10-24: finalize the computing parts of the whole program
67 // 2005-10-25: add non-discretization for the classification variable. Also slightly change some output info for the web application
68 // 2005-11-01: add control to the user-defined max variable number and sample number
69 // 2006-01-26: add gnu_getline.c to convert the code to be compilable under Max OS.
70 // 2007-01-25: change the address info
71 
72 //---------------------------------------------------------
73 #include "mat_tools.h"
74 #include "parameters.h"
75 #include "table.h"
76 
77 
78 ///////////////////////////////////////////////////////////
79 //														 //
80 //														 //
81 //														 //
82 ///////////////////////////////////////////////////////////
83 
84 //---------------------------------------------------------
85 #define DELETE_ARRAY(p)	if( p )	{	delete[]p;	p	= NULL;	}
86 
87 //---------------------------------------------------------
88 enum ESG_mRMR_Selection
89 {
90 	SG_mRMR_SELECTION_RANK	= 0,
91 	SG_mRMR_SELECTION_INDEX,
92 	SG_mRMR_SELECTION_NAME,
93 	SG_mRMR_SELECTION_SCORE
94 };
95 
96 
97 ///////////////////////////////////////////////////////////
98 //														 //
99 ///////////////////////////////////////////////////////////
100 
101 //---------------------------------------------------------
CSG_mRMR(void)102 CSG_mRMR::CSG_mRMR(void)
103 {
104 	m_Samples		= NULL;
105 	m_nSamples		= 0;
106 	m_nVars			= 0;
107 	m_bDiscretized	= false;	// initialize the data as continous
108 	m_bVerbose		= false;
109 
110 	m_pSelection	= new CSG_Table;
111 
112 	m_pSelection->Add_Field("RANK" , SG_DATATYPE_Int   );	// mRMR_SELFLD_RANK
113 	m_pSelection->Add_Field("INDEX", SG_DATATYPE_Int   );	// mRMR_SELFLD_INDEX
114 	m_pSelection->Add_Field("NAME" , SG_DATATYPE_String);	// mRMR_SELFLD_NAME
115 	m_pSelection->Add_Field("SCORE", SG_DATATYPE_Double);	// mRMR_SELFLD_SCORE
116 }
117 
118 //---------------------------------------------------------
~CSG_mRMR(void)119 CSG_mRMR::~CSG_mRMR(void)
120 {
121 	Destroy();
122 
123 	delete(m_pSelection);
124 }
125 
126 //---------------------------------------------------------
Destroy(void)127 void CSG_mRMR::Destroy(void)
128 {
129 	if( m_Samples )
130 	{
131 		DELETE_ARRAY(m_Samples[0]);
132 		DELETE_ARRAY(m_Samples);
133 	}
134 
135 	m_VarNames		.Clear();
136 	m_nSamples		= 0;
137 	m_nVars			= 0;
138 	m_bDiscretized	= false;	// initialize the data as continous
139 
140 	m_pSelection	->Del_Records();
141 }
142 
143 
144 ///////////////////////////////////////////////////////////
145 //														 //
146 ///////////////////////////////////////////////////////////
147 
148 //---------------------------------------------------------
Get_Description(void)149 CSG_String CSG_mRMR::Get_Description(void)
150 {
151 	return( _TW(
152 		"The minimum Redundancy Maximum Relevance (mRMR) feature selection "
153 		"algorithm has been developed by Hanchuan Peng <hanchuan.peng@gmail.com>.\n"
154 		"\n"
155 		"References:\n"
156 		"Feature selection based on mutual information: criteria of max-dependency, max-relevance, and min-redundancy. "
157 		"Hanchuan Peng, Fuhui Long, and Chris Ding, "
158 		"IEEE Transactions on Pattern Analysis and Machine Intelligence, "
159 		"Vol. 27, No. 8, pp.1226-1238, 2005.\n"
160 		"\n"
161 		"Minimum redundancy feature selection from microarray gene expression data,\n"
162 		"Chris Ding, and Hanchuan Peng, "
163 		"Journal of Bioinformatics and Computational Biology, "
164 		"Vol. 3, No. 2, pp.185-205, 2005.\n"
165 		"\n"
166 		"Hanchuan Peng's mRMR Homepage at "
167 		"<a target=\"_blank\" href=\"http://penglab.janelia.org/proj/mRMR/\">"
168 		"http://penglab.janelia.org/proj/mRMR/</a>\n"
169 	));
170 }
171 
172 
173 ///////////////////////////////////////////////////////////
174 //														 //
175 ///////////////////////////////////////////////////////////
176 
177 //---------------------------------------------------------
Parameters_Add(CSG_Parameters * pParameters,CSG_Parameter * pNode)178 bool CSG_mRMR::Parameters_Add(CSG_Parameters *pParameters, CSG_Parameter *pNode)
179 {
180 	CSG_String	ParentID(pNode ? pNode->Get_Identifier() : SG_T(""));
181 
182 	pParameters->Add_Int(
183 		ParentID, "mRMR_NFEATURES"	, _TL("Number of Features"),
184 		_TL(""),
185 		50, 1, true
186 	);
187 
188 	pParameters->Add_Bool(
189 		ParentID, "mRMR_DISCRETIZE"	, _TL("Discretization"),
190 		_TL("uncheck this means no discretizaton (i.e. data is already integer)"),
191 		true
192 	);
193 
194 	pParameters->Add_Double(
195 		ParentID, "mRMR_THRESHOLD"	, _TL("Discretization Threshold"),
196 		_TL("a double number of the discretization threshold; set to 0 to make binarization"),
197 		1.0, 0.0, true
198 	);
199 
200 	pParameters->Add_Choice(
201 		ParentID, "mRMR_METHOD"		, _TL("Selection Method"),
202 		_TL(""),
203 		CSG_String::Format("%s|%s|",
204 			_TL("Mutual Information Difference (MID)"),
205 			_TL("Mutual Information Quotient (MIQ)")
206 		), 0
207 	);
208 
209 	return( true );
210 }
211 
212 //---------------------------------------------------------
Parameters_Enable(CSG_Parameters * pParameters,CSG_Parameter * pParameter)213 int CSG_mRMR::Parameters_Enable(CSG_Parameters *pParameters, CSG_Parameter *pParameter)
214 {
215 	if( pParameter->Cmp_Identifier("mRMR_DISCRETIZE") )
216 	{
217 		pParameters->Set_Enabled("mRMR_THRESHOLD", pParameter->asBool());
218 	}
219 
220 	return( 1 );
221 }
222 
223 //---------------------------------------------------------
Set_Data(CSG_Table & Data,int ClassField,CSG_Parameters * pParameters)224 bool CSG_mRMR::Set_Data(CSG_Table &Data, int ClassField, CSG_Parameters *pParameters)
225 {
226 	bool	bDiscretize	= (*pParameters)("mRMR_DISCRETIZE") ? (*pParameters)("mRMR_DISCRETIZE")->asBool  () : true;
227 	double	Threshold	= (*pParameters)("mRMR_THRESHOLD" ) ? (*pParameters)("mRMR_THRESHOLD" )->asDouble() : 1.0;
228 
229 	return( Set_Data(Data, ClassField, bDiscretize ? Threshold : -1.0) );
230 }
231 
Set_Data(CSG_Matrix & Data,int ClassField,CSG_Parameters * pParameters)232 bool CSG_mRMR::Set_Data(CSG_Matrix &Data, int ClassField, CSG_Parameters *pParameters)
233 {
234 	bool	bDiscretize	= (*pParameters)("mRMR_DISCRETIZE") ? (*pParameters)("mRMR_DISCRETIZE")->asBool  () : true;
235 	double	Threshold	= (*pParameters)("mRMR_THRESHOLD" ) ? (*pParameters)("mRMR_THRESHOLD" )->asDouble() : 1.0;
236 
237 	return( Set_Data(Data, ClassField, bDiscretize ? Threshold : -1.0) );
238 }
239 
240 //---------------------------------------------------------
Get_Selection(CSG_Parameters * pParameters)241 bool CSG_mRMR::Get_Selection(CSG_Parameters *pParameters)
242 {
243 	int		nFeatures	= (*pParameters)("mRMR_NFEATURES") ? (*pParameters)("mRMR_NFEATURES")->asInt() : 50;
244 	int		Method		= (*pParameters)("mRMR_METHOD"   ) ? (*pParameters)("mRMR_METHOD"   )->asInt() : 0;
245 
246 	return( Get_Selection(nFeatures, Method) );
247 }
248 
249 
250 ///////////////////////////////////////////////////////////
251 //														 //
252 ///////////////////////////////////////////////////////////
253 
254 //---------------------------------------------------------
Get_Count(void) const255 int CSG_mRMR::Get_Count	(void) const
256 {
257 	return( m_pSelection->Get_Count() );
258 }
259 
Get_Index(int i) const260 int CSG_mRMR::Get_Index(int i) const
261 {
262 	return( m_pSelection->Get_Record(i)->asInt   (SG_mRMR_SELECTION_INDEX) );
263 }
264 
Get_Name(int i) const265 CSG_String CSG_mRMR::Get_Name(int i) const
266 {
267 	return( m_pSelection->Get_Record(i)->asString(SG_mRMR_SELECTION_NAME ) );
268 }
269 
Get_Score(int i) const270 double CSG_mRMR::Get_Score(int i) const
271 {
272 	return( m_pSelection->Get_Record(i)->asDouble(SG_mRMR_SELECTION_SCORE) );
273 }
274 
275 
276 ///////////////////////////////////////////////////////////
277 //														 //
278 ///////////////////////////////////////////////////////////
279 
280 //---------------------------------------------------------
281 #define ADD_MESSAGE(Message)	if( m_bVerbose ) SG_UI_Msg_Add_Execution(CSG_String(Message) + "\n", false);
282 #define ADD_WARNING(Message)	if( m_bVerbose ) SG_UI_Msg_Add_Execution(_TL("Warning") + CSG_String(": ") + CSG_String(Message) + "\n", false, SG_UI_MSG_STYLE_FAILURE);
283 
284 
285 ///////////////////////////////////////////////////////////
286 //														 //
287 ///////////////////////////////////////////////////////////
288 
289 //---------------------------------------------------------
Get_Memory(int nVars,int nSamples)290 bool CSG_mRMR::Get_Memory(int nVars, int nSamples)
291 {
292 	Destroy();
293 
294 	//-----------------------------------------------------
295 	m_nVars		= nVars;
296 
297 	if( m_nVars <= 0 )
298 	{
299 		SG_UI_Msg_Add_Error("no features");
300 
301 		return( false );
302 	}
303 
304 	m_nSamples	= nSamples;
305 
306 	if( m_nSamples <= 0 )
307 	{
308 		SG_UI_Msg_Add_Error("no samples");
309 
310 		return( false );
311 	}
312 
313 	//-----------------------------------------------------
314 	m_Samples		= new double *[m_nSamples];
315 
316 	if( !m_Samples )
317 	{
318 		SG_UI_Msg_Add_Error("failed to allocate memory.");
319 
320 		return( false );
321 	}
322 
323 	m_Samples[0]	= new double[m_nSamples * m_nVars];
324 
325 	if( !m_Samples[0] )
326 	{
327 		SG_UI_Msg_Add_Error("failed to allocate memory.");
328 
329 		return( false );
330 	}
331 
332 	return( true );
333 }
334 
335 //---------------------------------------------------------
Set_Data(CSG_Table & Data,int ClassField,double Threshold)336 bool CSG_mRMR::Set_Data(CSG_Table &Data, int ClassField, double Threshold)
337 {
338 	if( !Get_Memory(Data.Get_Field_Count(), Data.Get_Count()) )
339 	{
340 		return( false );
341 	}
342 
343 	//-----------------------------------------------------
344 	if( ClassField < 0 || ClassField >= m_nVars )
345 	{
346 		ClassField	= 0;
347 	}
348 
349 	Data.Set_Index(ClassField, TABLE_INDEX_Ascending);
350 
351 	CSG_String	s;
352 
353 	for(int iSample=0, Class=0; iSample<m_nSamples; iSample++)
354 	{
355 		double	*pData	= m_Samples[iSample] = m_Samples[0] + iSample * m_nVars;
356 
357 		if( s.Cmp(Data[iSample].asString(ClassField)) )
358 		{
359 			s	= Data[iSample].asString(ClassField);
360 
361 			Class++;
362 		}
363 
364 		*pData++	= Class;
365 
366 		for(int iVar=0; iVar<m_nVars; iVar++)
367 		{
368 			if( iVar != ClassField )
369 			{
370 				*pData++	= Data[iSample].asDouble(iVar);
371 			}
372 		}
373 	}
374 
375 	Data.Del_Index();
376 
377 	m_VarNames	+= Data.Get_Field_Name(ClassField);
378 
379 	for(int iVar=0; iVar<m_nVars; iVar++)
380 	{
381 		if( iVar != ClassField )
382 		{
383 			m_VarNames	+= Data.Get_Field_Name(iVar);
384 		}
385 	}
386 
387 	//-----------------------------------------------------
388 	if( Threshold >= 0.0 )	// discretization
389 	{
390 		Discretize(Threshold);
391 	}
392 
393 	return( true );
394 }
395 
396 //---------------------------------------------------------
Set_Data(CSG_Matrix & Data,int ClassField,double Threshold)397 bool CSG_mRMR::Set_Data(CSG_Matrix &Data, int ClassField, double Threshold)
398 {
399 	if( !Get_Memory(Data.Get_NCols(), Data.Get_NRows()) )
400 	{
401 		return( false );
402 	}
403 
404 	//-----------------------------------------------------
405 	if( ClassField < 0 || ClassField >= m_nVars )
406 	{
407 		ClassField	= 0;
408 	}
409 
410 	for(int iSample=0; iSample<m_nSamples; iSample++)
411 	{
412 		double	*pData	= m_Samples[iSample] = m_Samples[0] + iSample * m_nVars;
413 
414 		*pData++	= Data[iSample][ClassField];
415 
416 		for(int iVar=0; iVar<m_nVars; iVar++)
417 		{
418 			if( iVar != ClassField )
419 			{
420 				*pData++	= Data[iSample][iVar];
421 			}
422 		}
423 	}
424 
425 	m_VarNames	+= "CLASS";
426 
427 	for(int iVar=0; iVar<m_nVars; iVar++)
428 	{
429 		if( iVar != ClassField )
430 		{
431 			m_VarNames	+= CSG_String::Format(SG_T("FEATURE_%02d"), iVar);
432 		}
433 	}
434 
435 	//-----------------------------------------------------
436 	if( Threshold >= 0.0 )	// discretization
437 	{
438 		Discretize(Threshold);
439 	}
440 
441 	return( true );
442 }
443 
444 
445 ///////////////////////////////////////////////////////////
446 //														 //
447 ///////////////////////////////////////////////////////////
448 
449 //---------------------------------------------------------
Discretize(double Threshold)450 bool CSG_mRMR::Discretize(double Threshold)
451 {
452 	if( !m_Samples[0] || Threshold < 0.0 || m_bDiscretized )
453 	{
454 		return( false );
455 	}
456 
457 	long	i, j;	// exclude the first column
458 
459 	//-----------------------------------------------------
460 	for(j=1; j<m_nVars; j++)	// z-score
461 	{
462 		double cursum	= 0;
463 		double curmean	= 0;
464 		double curstd	= 0;
465 
466 		for(i=0; i<m_nSamples; i++)
467 		{
468 			cursum	+= m_Samples[i][j];
469 		}
470 
471 		curmean	= cursum / m_nSamples;
472 		cursum	= 0;
473 
474 		register double tmpf;
475 
476 		for(i=0; i<m_nSamples; i++)
477 		{
478 			tmpf	= m_Samples[i][j] - curmean;
479 			cursum += tmpf * tmpf;
480 		}
481 
482 		curstd	= (m_nSamples == 1) ? 0 : sqrt(cursum / (m_nSamples - 1));	// m_nSamples -1 is an unbiased version for Gaussian
483 
484 		for(i=0; i<m_nSamples; i++)
485 		{
486 			m_Samples[i][j]	= (m_Samples[i][j] - curmean) / curstd;
487 		}
488 	}
489 
490 	//-----------------------------------------------------
491 	for(j=1; j<m_nVars; j++)
492 	{
493 		register double tmpf;
494 
495 		for(i=0; i<m_nSamples; i++)
496 		{
497 			tmpf	= m_Samples[i][j];
498 
499 			if( tmpf > Threshold )
500 			{
501 				tmpf	=  1;
502 			}
503 			else if( tmpf < -Threshold )
504 			{
505 				tmpf	= -1;
506 			}
507 			else
508 			{
509 				tmpf	=  0;
510 			}
511 
512 			m_Samples[i][j]	= tmpf;
513 		}
514 	}
515 
516 	m_bDiscretized	= true;
517 
518 	return( true );
519 }
520 
521 
522 ///////////////////////////////////////////////////////////
523 //														 //
524 ///////////////////////////////////////////////////////////
525 
526 //---------------------------------------------------------
527 #define ADD_FEATURE(rank, score)	{\
528 	CSG_Table_Record	*pFeature	= m_pSelection->Add_Record();\
529 	\
530 	pFeature->Set_Value(SG_mRMR_SELECTION_RANK , rank + 1);\
531 	pFeature->Set_Value(SG_mRMR_SELECTION_INDEX, feaInd[rank]);\
532 	pFeature->Set_Value(SG_mRMR_SELECTION_NAME , m_VarNames[(size_t)feaInd[rank]]);\
533 	pFeature->Set_Value(SG_mRMR_SELECTION_SCORE, score);\
534 	\
535 	ADD_MESSAGE(CSG_String::Format(SG_T("%d \t %d \t %s \t %5.3f"),\
536 		rank + 1, feaInd[rank], m_VarNames[(size_t)feaInd[rank]].c_str(), score)\
537 	);\
538 }
539 
540 //---------------------------------------------------------
Pool_Compare(const void * a,const void * b)541 int CSG_mRMR::Pool_Compare(const void *a, const void *b)
542 {
543 	if( ((TPool *)a)->mival < ((TPool *)b)->mival )
544 		return( -1 );
545 
546 	if( ((TPool *)a)->mival > ((TPool *)b)->mival )
547 		return(  1 );
548 
549 	return( 0 );
550 }
551 
552 //---------------------------------------------------------
Get_Selection(int nFeatures,int Method)553 bool CSG_mRMR::Get_Selection(int nFeatures, int Method)
554 {
555 	m_pSelection->Del_Records();
556 
557 	if( !m_Samples[0] )
558 	{
559 		SG_UI_Msg_Add_Error("The input data is NULL.");
560 
561 		return( false );
562 	}
563 
564 	if( nFeatures < 0 )
565 	{
566 		SG_UI_Msg_Add_Error("The input number of features is negative.");
567 
568 		return( false );
569 	}
570 
571 	long poolUseFeaLen = 500;
572 	if( poolUseFeaLen > m_nVars - 1 )	// there is a target variable (the first one), that is why must remove one
573 		poolUseFeaLen = m_nVars - 1;
574 
575 	if( nFeatures > poolUseFeaLen )
576 		nFeatures = poolUseFeaLen;
577 
578 	long	*feaInd	= new long[nFeatures];
579 
580 	if( !feaInd )
581 	{
582 		SG_UI_Msg_Add_Error("Fail to allocate memory.");
583 
584 		return( false );
585 	}
586 
587 	//-----------------------------------------------------
588 	// determine the pool
589 
590 	TPool	*Pool	= (TPool *)SG_Malloc(m_nVars * sizeof(TPool));
591 
592 	if( !Pool )
593 	{
594 		SG_UI_Msg_Add_Error("Fail to allocate memory.");
595 
596 		return( false );
597 	}
598 
599 	//-----------------------------------------------------
600 	long	i, j, k;
601 
602 	for(i=0; i<m_nVars; i++)	// the Pool[0].mival is the entropy of target classification variable
603 	{
604 		Pool[i].mival	= -Get_MutualInfo(0, i);	// set as negative for sorting purpose
605 		Pool[i].Index	= i;
606 		Pool[i].Mask	= 1;	// initialized to be everything in pool would be considered
607 	}
608 
609 	qsort(Pool + 1, m_nVars - 1, sizeof(TPool), Pool_Compare);
610 
611 	Pool[0].mival	= -Pool[0].mival;
612 
613 	ADD_MESSAGE(CSG_String::Format(
614 		SG_T("\nTarget classification variable (#%d column in the input data) has name=%s \tentropy score=%5.3f"),
615 		0 + 1, m_VarNames[0].c_str(), Pool[0].mival
616 	));
617 
618 	ADD_MESSAGE("\n*** MaxRel features ***");
619 	ADD_MESSAGE("Order\tFea\tName\tScore");
620 
621 	for(i=1; i<m_nVars-1; i++)
622 	{
623 		Pool[i].mival	= -Pool[i].mival;
624 
625 		if( i <= nFeatures )
626 		{
627 			ADD_MESSAGE(CSG_String::Format(SG_T("%d \t %d \t %s \t %5.3f"),
628 				i, (int)Pool[i].Index, m_VarNames[(int)Pool[i].Index].c_str(), Pool[i].mival
629 			));
630 		}
631 	}
632 
633 	//-----------------------------------------------------
634 	// mRMR selection
635 
636 	long	poolFeaIndMin	= 1;
637 	long	poolFeaIndMax	= poolFeaIndMin + poolUseFeaLen - 1;
638 
639 	feaInd[0]	= Pool[1].Index;
640 	Pool[feaInd[0]].Mask	= 0;	// after selection, no longer consider this feature
641 	Pool[0        ].Mask	= 0;	// of course the first one, which corresponds to the classification variable, should not be considered. Just set the mask as 0 for safety.
642 
643 	ADD_MESSAGE("\n*** mRMR features ***");
644 	ADD_MESSAGE("Order\tFea\tName\tScore");
645 
646 	ADD_FEATURE(0, Pool[1].mival);
647 
648 	for(k=1; k<nFeatures; k++)	//the first one, feaInd[0] has been determined already
649 	{
650 		double	relevanceVal, redundancyVal, tmpscore, selectscore;
651 		long	selectind;
652 		int		b_firstSelected	= 0;
653 
654 		for(i=poolFeaIndMin; i<=poolFeaIndMax; i++)
655 		{
656 			if( Pool[Pool[i].Index].Mask == 0 )
657 			{
658 				continue;	// skip this feature as it was selected already
659 			}
660 
661 			relevanceVal	= Get_MutualInfo(0, Pool[i].Index);	// actually no necessary to re-compute it, this value can be retrieved from Pool[].mival vector
662 			redundancyVal	= 0;
663 
664 			for(j=0; j<k; j++)
665 			{
666 				redundancyVal	+= Get_MutualInfo(feaInd[j], Pool[i].Index);
667 			}
668 
669 			redundancyVal /= k;
670 
671 			switch( Method )
672 			{
673 			default:	// fprintf(stderr, "Undefined selection method=%d. Use MID instead.\n", mymethod);
674 			case SG_mRMR_Method_MID:	tmpscore	= relevanceVal - redundancyVal;				break;
675 			case SG_mRMR_Method_MIQ:	tmpscore	= relevanceVal / (redundancyVal + 0.0001);	break;
676 			}
677 
678 			if( b_firstSelected == 0 )
679 			{
680 				selectscore		= tmpscore;
681 				selectind		= Pool[i].Index;
682 				b_firstSelected	= 1;
683 			}
684 			else if( tmpscore > selectscore )
685 			{	//update the best feature found and the score
686 				selectscore		= tmpscore;
687 				selectind		= Pool[i].Index;
688 			}
689 		}
690 
691 		feaInd[k]	= selectind;
692 		Pool[selectind].Mask	= 0;
693 
694 		ADD_FEATURE(k, selectscore);
695 	}
696 
697 	//-----------------------------------------------------
698 	return( true );
699 }
700 
701 
702 ///////////////////////////////////////////////////////////
703 //														 //
704 ///////////////////////////////////////////////////////////
705 
706 //---------------------------------------------------------
Get_MutualInfo(long v1,long v2)707 double CSG_mRMR::Get_MutualInfo(long v1, long v2)
708 {
709 	double	mi	= -1;	// initialized as an illegal value
710 
711 	if( !m_Samples[0] )
712 	{
713 		SG_UI_Msg_Add_Error("The input data is NULL.");
714 
715 		return mi;
716 	}
717 
718 	if( v1 >= m_nVars || v2 >= m_nVars || v1 < 0 || v2 < 0 )
719 	{
720 		SG_UI_Msg_Add_Error("The input variable indexes are invalid (out of range).");
721 
722 		return mi;
723 	}
724 
725 	//-----------------------------------------------------
726 	// copy data
727 
728 	int	*v1data	= new int[m_nSamples];
729 	int	*v2data	= new int[m_nSamples];
730 
731 	if( !v1data || !v2data )
732 	{
733 		SG_UI_Msg_Add_Error("Fail to allocate memory.");
734 
735 		return mi;
736 	}
737 
738 	for(long i=0; i<m_nSamples; i++)
739 	{
740 		v1data[i]	= (int)m_Samples[i][v1];	// the double already been discretized, should be safe now
741 		v2data[i]	= (int)m_Samples[i][v2];
742 	}
743 
744 	//-----------------------------------------------------
745 	// compute mutual info
746 
747 	long	nstate	= 3;	// always true for DataTable, which was discretized as three states
748 
749 	int		nstate1 = 0, nstate2 = 0;
750 
751 	double	*pab	= Get_JointProb(v1data, v2data, m_nSamples, nstate, nstate1, nstate2);
752 
753 	mi	= Get_MutualInfo(pab, nstate1, nstate2);
754 
755 	//-----------------------------------------------------
756 	// free memory and return
757 
758 	DELETE_ARRAY(v1data);
759 	DELETE_ARRAY(v2data);
760 	DELETE_ARRAY(pab   );
761 
762 	return mi;
763 }
764 
765 //---------------------------------------------------------
Get_MutualInfo(double * pab,long pabhei,long pabwid)766 double CSG_mRMR::Get_MutualInfo(double *pab, long pabhei, long pabwid)
767 {
768 	//-----------------------------------------------------
769 	// check if parameters are correct
770 
771 	if( !pab )
772 	{
773 		SG_UI_Msg_Add_Error("Got illeagal parameter in compute_mutualinfo().");
774 
775 		return( -1.0 );
776 	}
777 
778 	//-----------------------------------------------------
779 	long i, j;
780 
781 	double	**pab2d	= new double *[pabwid];
782 
783 	for(j=0; j<pabwid; j++)
784 	{
785 		pab2d[j]	= pab + (long)j * pabhei;
786 	}
787 
788 	double	*p1 = 0, *p2 = 0;
789 	long	p1len = 0, p2len = 0;
790 	int		b_findmarginalprob = 1;
791 
792 	//-----------------------------------------------------
793 	//generate marginal probability arrays
794 
795 	if( b_findmarginalprob != 0 )
796 	{
797 		p1len	= pabhei;
798 		p2len	= pabwid;
799 		p1		= new double[p1len];
800 		p2		= new double[p2len];
801 
802 		for (i = 0; i < p1len; i++)	p1[i] = 0;
803 		for (j = 0; j < p2len; j++)	p2[j] = 0;
804 
805 		for (i = 0; i < p1len; i++)	//p1len = pabhei
806 		{
807 			for (j = 0; j < p2len; j++)	//p2len = panwid
808 			{
809 			//	printf("%f ",pab2d[j][i]);
810 				p1[i] += pab2d[j][i];
811 				p2[j] += pab2d[j][i];
812 			}
813 		}
814 	}
815 
816 	//-----------------------------------------------------
817 	// calculate the mutual information
818 
819 	double muInf = 0;
820 
821 	muInf	= 0.0;
822 
823 	for (j = 0; j < pabwid; j++)	// should for p2
824 	{
825 		for (i = 0; i < pabhei; i++)	// should for p1
826 		{
827 			if (pab2d[j][i] != 0 && p1[i] != 0 && p2[j] != 0)
828 			{
829 				muInf += pab2d[j][i] * log (pab2d[j][i] / p1[i] / p2[j]);
830 			//	printf("%f %fab %fa %fb\n",muInf,pab2d[j][i]/p1[i]/p2[j],p1[i],p2[j]);
831 			}
832 		}
833 	}
834 
835 	muInf	/= log(2.0);
836 
837 	//-----------------------------------------------------
838 	// free memory
839 
840 	DELETE_ARRAY(pab2d);
841 
842 	if( b_findmarginalprob != 0 )
843 	{
844 		DELETE_ARRAY(p1);
845 		DELETE_ARRAY(p2);
846 	}
847 
848 	return muInf;
849 }
850 
851 
852 ///////////////////////////////////////////////////////////
853 //														 //
854 ///////////////////////////////////////////////////////////
855 
856 //---------------------------------------------------------
Get_JointProb(T * img1,T * img2,long len,long maxstatenum,int & nstate1,int & nstate2)857 template <class T> double * CSG_mRMR::Get_JointProb(T * img1, T * img2, long len, long maxstatenum, int &nstate1, int &nstate2)
858 {
859 	long	i, j;
860 
861 	//-----------------------------------------------------
862 	// get and check size information
863 
864 	if (!img1 || !img2 || len < 0)
865 	{
866 		SG_UI_Msg_Add_Error("At least one of the input vectors is invalid.");
867 
868 		return( NULL );
869 	}
870 
871 //	int	b_findstatenum	= 1;	//  int nstate1 = 0, nstate2 = 0;
872 	int	b_returnprob	= 1;
873 
874 	//-----------------------------------------------------
875 	// copy data into new INT type array (hence quantization) and then reange them begin from 0 (i.e. state1)
876 
877 	int	*vec1	= new int[len];
878 	int	*vec2	= new int[len];
879 
880 	if( !vec1 || !vec2 )
881 	{
882 		SG_UI_Msg_Add_Error("Fail to allocate memory.\n");
883 
884 		return( NULL );
885 	}
886 
887 	int	nrealstate1 = 0, nrealstate2 = 0;
888 
889 	Copy_Vector(img1, len, vec1, nrealstate1);
890 	Copy_Vector(img2, len, vec2, nrealstate2);
891 
892 	//update the #state when necessary
893 	nstate1 = (nstate1 < nrealstate1) ? nrealstate1 : nstate1;
894 	//printf("First vector #state = %i\n",nrealstate1);
895 	nstate2 = (nstate2 < nrealstate2) ? nrealstate2 : nstate2;
896 	//printf("Second vector #state = %i\n",nrealstate2);
897 
898 	//  if (nstate1!=maxstatenum || nstate2!=maxstatenum)
899 	//    printf("find nstate1=%d nstate2=%d\n", nstate1, nstate2);
900 
901 	//-----------------------------------------------------
902 	// generate the joint-distribution table
903 
904 	double	 *hab	= new double[nstate1 * nstate2];
905 	double	**hab2d	= new double *[nstate2];
906 
907 	if( !hab || !hab2d )
908 	{
909 		SG_UI_Msg_Add_Error("Fail to allocate memory.");
910 
911 		return( NULL );
912 	}
913 
914 	for(j=0; j<nstate2; j++)
915 	{
916 		hab2d[j]	= hab + (long)j * nstate1;
917 	}
918 
919 	for(i=0; i<nstate1; i++)
920 	{
921 		for(j=0; j<nstate2; j++)
922 		{
923 			hab2d[j][i]	= 0;
924 		}
925 	}
926 
927 	for(i=0; i<len; i++)
928 	{
929 		// old method -- slow
930 		//    indx = (long)(vec2[i]) * nstate1 + vec1[i];
931 		//    hab[indx] += 1;
932 
933 		// new method -- fast
934 		hab2d[vec2[i]][vec1[i]]	+= 1;
935 		//printf("vec2[%d]=%d vec1[%d]=%d\n", i, vec2[i], i, vec1[i]);
936 	}
937 
938 	//-----------------------------------------------------
939 	// return the probabilities, otherwise return count numbers
940 
941 	if( b_returnprob )
942 	{
943 		for(i=0; i<nstate1; i++)
944 		{
945 			for(j=0; j<nstate2; j++)
946 			{
947 				hab2d[j][i]	/= len;
948 			}
949 		}
950 	}
951 
952 	//-----------------------------------------------------
953 	// finish
954 
955 	DELETE_ARRAY(hab2d);
956 	DELETE_ARRAY(vec1);
957 	DELETE_ARRAY(vec2);
958 
959 	return hab;
960 }
961 
962 //---------------------------------------------------------
Copy_Vector(T * srcdata,long len,int * desdata,int & nstate)963 template <class T> void CSG_mRMR::Copy_Vector(T *srcdata, long len, int *desdata, int &nstate)
964 {
965 	if (!srcdata || !desdata)
966 	{
967 		SG_UI_Msg_Add_Error("no points in Copy_Vector()!");
968 
969 		return;
970 	}
971 
972 	// note: originally I added 0.5 before rounding, however seems the negative numbers and
973 	//      positive numbers are all rounded towarded 0; hence int(-1+0.5)=0 and int(1+0.5)=1;
974 	//      This is unwanted because I need the above to be -1 and 1.
975 	// for this reason I just round with 0.5 adjustment for positive and negative differently
976 
977 	// copy data
978 	int	minn, maxx;
979 	if (srcdata[0] > 0)
980 	{
981 		maxx = minn = int (srcdata[0] + 0.5);
982 	}
983 	else
984 	{
985 		maxx = minn = int (srcdata[0] - 0.5);
986 	}
987 
988 	long	i;
989 	int		tmp;
990 	double	tmp1;
991 
992 	for (i = 0; i < len; i++)
993 	{
994 		tmp1	= double (srcdata[i]);
995 		tmp		= (tmp1 > 0) ? (int) (tmp1 + 0.5) : (int) (tmp1 - 0.5);	//round to integers
996 		minn	= (minn < tmp) ? minn : tmp;
997 		maxx	= (maxx > tmp) ? maxx : tmp;
998 		desdata[i] = tmp;
999 	//	printf("%i ",desdata[i]);
1000 	}
1001 
1002 	// make the vector data begin from 0 (i.e. 1st state)
1003 	for (i = 0; i < len; i++)
1004 	{
1005 		desdata[i] -= minn;
1006 	}
1007 
1008 	// return the #state
1009 	nstate = (maxx - minn + 1);
1010 
1011 	return;
1012 }
1013 
1014 
1015 ///////////////////////////////////////////////////////////
1016 //														 //
1017 //														 //
1018 //														 //
1019 ///////////////////////////////////////////////////////////
1020 
1021 //---------------------------------------------------------
1022 #endif // WITH_MRMR
1023