1 
2 ///////////////////////////////////////////////////////////
3 //                                                       //
4 //                         SAGA                          //
5 //                                                       //
6 //      System for Automated Geoscientific Analyses      //
7 //                                                       //
8 //                     Tool Library                      //
9 //                    Table_Calculus                     //
10 //                                                       //
11 //-------------------------------------------------------//
12 //                                                       //
13 //               table_cluster_analysis.cpp              //
14 //                                                       //
15 //                 Copyright (C) 2010 by                 //
16 //                      Olaf Conrad                      //
17 //                                                       //
18 //-------------------------------------------------------//
19 //                                                       //
20 // This file is part of 'SAGA - System for Automated     //
21 // Geoscientific Analyses'. SAGA is free software; you   //
22 // can redistribute it and/or modify it under the terms  //
23 // of the GNU General Public License as published by the //
24 // Free Software Foundation, either version 2 of the     //
25 // License, or (at your option) any later version.       //
26 //                                                       //
27 // SAGA is distributed in the hope that it will be       //
28 // useful, but WITHOUT ANY WARRANTY; without even the    //
29 // implied warranty of MERCHANTABILITY or FITNESS FOR A  //
30 // PARTICULAR PURPOSE. See the GNU General Public        //
31 // License for more details.                             //
32 //                                                       //
33 // You should have received a copy of the GNU General    //
34 // Public License along with this program; if not, see   //
35 // <http://www.gnu.org/licenses/>.                       //
36 //                                                       //
37 //-------------------------------------------------------//
38 //                                                       //
39 //    e-mail:     oconrad@saga-gis.org                   //
40 //                                                       //
41 //    contact:    Olaf Conrad                            //
42 //                Institute of Geography                 //
43 //                University of Hamburg                  //
44 //                Germany                                //
45 //                                                       //
46 ///////////////////////////////////////////////////////////
47 
48 //---------------------------------------------------------
49 #include "table_cluster_analysis.h"
50 
51 
52 ///////////////////////////////////////////////////////////
53 //														 //
54 //														 //
55 //														 //
56 ///////////////////////////////////////////////////////////
57 
58 //---------------------------------------------------------
CTable_Cluster_Analysis(bool bShapes)59 CTable_Cluster_Analysis::CTable_Cluster_Analysis(bool bShapes)
60 {
61 	Set_Author		("O. Conrad (c) 2010");
62 
63 	Set_Description	(_TW(
64 		"Cluster Analysis for tables."
65 	));
66 
67 	Add_Reference("Forgy, E.", "1965",
68 		"Cluster Analysis of multivariate data: efficiency vs. interpretability of classifications",
69 		"Biometrics 21:768."
70 	);
71 
72 	Add_Reference("Rubin, J.", "1967",
73 		"Optimal Classification into Groups: An Approach for Solving the Taxonomy Problem",
74 		"J. Theoretical Biology, 15:103-144."
75 	);
76 
77 	//-----------------------------------------------------
78 	if( (m_bShapes = bShapes) == true )
79 	{
80 		Set_Name		(_TL("Cluster Analysis (Shapes)"));
81 
82 		Parameters.Add_Shapes("", "INPUT" , _TL("Shapes"), _TL(""), PARAMETER_INPUT);
83 		Parameters.Add_Shapes("", "RESULT", _TL("Result"), _TL(""), PARAMETER_OUTPUT_OPTIONAL);
84 	}
85 	else
86 	{
87 		Set_Name		(_TL("Cluster Analysis"));
88 
89 		Parameters.Add_Table("", "INPUT"  , _TL("Table" ), _TL(""), PARAMETER_INPUT);
90 		Parameters.Add_Table("", "RESULT" , _TL("Result"), _TL(""), PARAMETER_OUTPUT_OPTIONAL);
91 	}
92 
93 	Parameters.Add_Table_Fields("INPUT",
94 		"FIELDS"		, _TL("Attributes"),
95 		_TL("")
96 	);
97 
98 	Parameters.Add_Table_Field("INPUT",
99 		"CLUSTER"		, _TL("Cluster"),
100 		_TL(""),
101 		true
102 	);
103 
104 	Parameters.Add_Table("",
105 		"STATISTICS"	, _TL("Statistics"),
106 		_TL(""),
107 		PARAMETER_OUTPUT
108 	);
109 
110 	Parameters.Add_Choice("",
111 		"METHOD"		, _TL("Method"),
112 		_TL(""),
113 		CSG_String::Format("%s|%s|%s",
114 			_TL("Iterative Minimum Distance (Forgy 1965)"),
115 			_TL("Hill-Climbing (Rubin 1967)"),
116 			_TL("Combined Minimum Distance / Hillclimbing")
117 		), 1
118 	);
119 
120 	Parameters.Add_Int("",
121 		"NCLUSTER"	, _TL("Number of Clusters"),
122 		_TL(""),
123 		10, 2, true
124 	);
125 
126 	Parameters.Add_Bool("",
127 		"NORMALISE"	, _TL("Normalise"),
128 		_TL(""),
129 		false
130 	);
131 }
132 
133 
134 ///////////////////////////////////////////////////////////
135 //														 //
136 ///////////////////////////////////////////////////////////
137 
138 //---------------------------------------------------------
On_Execute(void)139 bool CTable_Cluster_Analysis::On_Execute(void)
140 {
141 	bool					bNormalize;
142 	int						iFeature, nFeatures, *Features, iElement, nElements, Cluster;
143 	CSG_Cluster_Analysis	Analysis;
144 	CSG_Table				*pTable;
145 
146 	//-----------------------------------------------------
147 	pTable		= Parameters("RESULT"   )->asTable();
148 	bNormalize	= Parameters("NORMALISE")->asBool();
149 	Cluster		= Parameters("CLUSTER"  )->asInt();
150 
151 	Features	= (int *)Parameters("FIELDS")->asPointer();
152 	nFeatures	=        Parameters("FIELDS")->asInt    ();
153 
154 	if( !Features || nFeatures <= 0 )
155 	{
156 		Error_Set(_TL("no features in selection"));
157 
158 		return( false );
159 	}
160 
161 	if( !Analysis.Create(nFeatures) )
162 	{
163 		Error_Set(_TL("could not initialize cluster engine"));
164 
165 		return( false );
166 	}
167 
168 	if( pTable && pTable != Parameters("INPUT")->asTable() )
169 	{
170 		if( m_bShapes )
171 		{
172 			((CSG_Shapes *)pTable)->Create(*Parameters("INPUT")->asShapes());
173 		}
174 		else
175 		{
176 			pTable->Create(*Parameters("INPUT")->asTable());
177 		}
178 	}
179 	else
180 	{
181 		pTable	= Parameters("INPUT")->asTable();
182 	}
183 
184 	if( Cluster < 0 )
185 	{
186 		Cluster	= pTable->Get_Field_Count();
187 
188 		pTable->Add_Field(_TL("CLUSTER"), SG_DATATYPE_Int);
189 	}
190 
191 	//-----------------------------------------------------
192 	for(iElement=0, nElements=0; iElement<pTable->Get_Count() && Set_Progress(iElement, pTable->Get_Count()); iElement++)
193 	{
194 		CSG_Table_Record	*pRecord	= pTable->Get_Record(iElement);
195 
196 		bool	bNoData		= false;
197 
198 		for(iFeature=0; iFeature<nFeatures && !bNoData; iFeature++)
199 		{
200 			if( pRecord->is_NoData(Features[iFeature]) )
201 			{
202 				bNoData	= true;
203 			}
204 		}
205 
206 		if( bNoData || !Analysis.Add_Element() )
207 		{
208 			pRecord->Set_NoData(Cluster);
209 		}
210 		else
211 		{
212 			pRecord->Set_Value(Cluster, 0.0);
213 
214 			for(iFeature=0; iFeature<nFeatures; iFeature++)
215 			{
216 				double	d	= pRecord->asDouble(Features[iFeature]);
217 
218 				if( bNormalize )
219 				{
220 					d	= (d - pTable->Get_Mean(Features[iFeature])) / pTable->Get_StdDev(Features[iFeature]);
221 				}
222 
223 				Analysis.Set_Feature(nElements, iFeature, d);
224 			}
225 
226 			nElements++;
227 		}
228 	}
229 
230 	if( nElements <= 1 )
231 	{
232 		return( false );
233 	}
234 
235 	//-----------------------------------------------------
236 	bool	bResult	= Analysis.Execute(Parameters("METHOD")->asInt(), Parameters("NCLUSTER")->asInt());
237 
238 	for(iElement=0, nElements=0; iElement<pTable->Get_Count(); iElement++)
239 	{
240 		Set_Progress(iElement, pTable->Get_Count());
241 
242 		CSG_Table_Record	*pRecord	= pTable->Get_Record(iElement);
243 
244 		if( !pRecord->is_NoData(Cluster) )
245 		{
246 			pRecord->Set_Value(Cluster, Analysis.Get_Cluster(nElements++));
247 		}
248 	}
249 
250 	Save_Statistics(pTable, Features, bNormalize, Analysis);
251 
252 //	Save_LUT(pCluster, Analysis.Get_nClusters());
253 
254 	DataObject_Update(pTable);
255 
256 	return( bResult );
257 }
258 
259 
260 ///////////////////////////////////////////////////////////
261 //														 //
262 ///////////////////////////////////////////////////////////
263 
264 //---------------------------------------------------------
Save_Statistics(CSG_Table * pTable,int * Features,bool bNormalize,const CSG_Cluster_Analysis & Analysis)265 void CTable_Cluster_Analysis::Save_Statistics(CSG_Table *pTable, int *Features, bool bNormalize, const CSG_Cluster_Analysis &Analysis)
266 {
267 	int			iCluster, iFeature;
268 	CSG_String	s;
269 	CSG_Table	*pStatistics;
270 
271 	pStatistics	= Parameters("STATISTICS")->asTable();
272 
273 	pStatistics->Destroy();
274 	pStatistics->Set_Name(_TL("Cluster Analysis"));
275 
276 	pStatistics->Add_Field(_TL("ClusterID")	, SG_DATATYPE_Int);
277 	pStatistics->Add_Field(_TL("Elements")	, SG_DATATYPE_Int);
278 	pStatistics->Add_Field(_TL("Std.Dev.")	, SG_DATATYPE_Double);
279 
280 	s.Printf("\n%s:\t%ld \n%s:\t%d \n%s:\t%d \n%s:\t%f\n\n%s\t%s\t%s",
281 		_TL("Number of Elements")		, Analysis.Get_nElements(),
282 		_TL("Number of Variables")		, Analysis.Get_nFeatures(),
283 		_TL("Number of Clusters")		, Analysis.Get_nClusters(),
284 		_TL("Value of Target Function")	, Analysis.Get_SP(),
285 		_TL("Cluster"), _TL("Elements"), _TL("Std.Dev.")
286 	);
287 
288 	for(iFeature=0; iFeature<Analysis.Get_nFeatures(); iFeature++)
289 	{
290 		s	+= CSG_String::Format("\t%s", pTable->Get_Field_Name(Features[iFeature]));
291 
292 		pStatistics->Add_Field(pTable->Get_Field_Name(Features[iFeature]), SG_DATATYPE_Double);
293 	}
294 
295 	Message_Add(s);
296 
297 	for(iCluster=0; iCluster<Analysis.Get_nClusters(); iCluster++)
298 	{
299 		s.Printf("\n%d\t%d\t%f", iCluster, Analysis.Get_nMembers(iCluster), sqrt(Analysis.Get_Variance(iCluster)));
300 
301 		CSG_Table_Record	*pRecord	= pStatistics->Add_Record();
302 
303 		pRecord->Set_Value(0, iCluster);
304 		pRecord->Set_Value(1, Analysis.Get_nMembers(iCluster));
305 		pRecord->Set_Value(2, sqrt(Analysis.Get_Variance(iCluster)));
306 
307 		for(iFeature=0; iFeature<Analysis.Get_nFeatures(); iFeature++)
308 		{
309 			double	Centroid	= Analysis.Get_Centroid(iCluster, iFeature);
310 
311 			if( bNormalize )
312 			{
313 				Centroid	= pTable->Get_Mean(Features[iFeature]) + Centroid * pTable->Get_StdDev(Features[iFeature]);
314 			}
315 
316 			s	+= CSG_String::Format("\t%f", Centroid);
317 
318 			pRecord->Set_Value(iFeature + 3, Centroid);
319 		}
320 
321 		Message_Add(s, false);
322 	}
323 }
324 
325 
326 ///////////////////////////////////////////////////////////
327 //														 //
328 //														 //
329 //														 //
330 ///////////////////////////////////////////////////////////
331 
332 //---------------------------------------------------------
333