1 /*****************************************************************************************
2 * Copyright (c) 2006 Hewlett-Packard Development Company, L.P.
3 * Permission is hereby granted, free of charge, to any person obtaining a copy of
4 * this software and associated documentation files (the "Software"), to deal in
5 * the Software without restriction, including without limitation the rights to use,
6 * copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
7 * Software, and to permit persons to whom the Software is furnished to do so,
8 * subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
11 *
12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
13 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
14 * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
15 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
16 * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
17 * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
18 *****************************************************************************************/
19
20 /************************************************************************
21 * FILE DESCR: Definitions of Agglomerative Hierarchical Clustering module
22 *
23 * CONTENTS:
24 * cluster
25 * getProximityMatrix
26 * setOutputConfig
27 * setHyperlinkMap
28 * getClusterResult
29 * computeProximityMatrix
30 * computeDistances
31 * clusterToFindNumClusters
32 * getInterObjectDistance
33 * findGroup
34 * findInterClusterDistance
35 * writeClustersAsHTML
36 * determineNumOfClusters
37 * determineKnee
38 * findRMSE
39 * computeAvgSil
40 *
41 *
42 * AUTHOR: Bharath A
43 *
44 * DATE: February 22, 2005
45 * CHANGE HISTORY:
46 * Author Date Description of change
47 ************************************************************************/
48 #ifndef __LTKHIERARCHICALCLUSTERING_H
49 #define __LTKHIERARCHICALCLUSTERING_H
50
51
52 #ifndef _WIN32
53 //#include <values.h>
54 #endif
55
56 #include "LTKInc.h"
57 #include "LTKTypes.h"
58 #include "LTKLoggerUtil.h"
59 #include "LTKException.h"
60 #include "LTKErrors.h"
61
62 /*Enumerator for stopping criterion to be used*/
63 enum ELTKHCStoppingCriterion
64 {
65 LMETHOD,
66 AVG_SIL
67 };
68
69 /*Enumerator for methods in hierarchical clustering*/
70 enum ELTKHCMethod
71 {
72 SINGLE_LINKAGE,
73 COMPLETE_LINKAGE,
74 AVERAGE_LINKAGE
75 };
76
77 #define OUTPUT_HTML_FILE_NAME "output.html"
78 #define MIN_CUTOFF 20
79
80 /**
81 * @class LTKHierarchicalClustering
82 * <p> This class does agglomerative hierarchical clustering. The data objects
83 which could be LTKTrace or LTKTraceGroup, are supplied as a vector.
84 Function that defines the distance between two data objects needs to be
85 supplied as a function pointer.One of the 3 methods (Single,Average or
86 Complete linkage) needs to be selected to define the way inter-cluster
87 has to be determined. In case number of clusters is not supplied,
88 it is determined using the L-method (default stopping criterion)<p> */
89
90 template <class ClusterObjType,class DistanceClass>
91 class LTKHierarchicalClustering
92 {
93
94 private:
95
96 //reference to the vector containing the data objects to be clustered
97 const vector<ClusterObjType>& m_data;
98
99 //triangular matrix containing the pairwise distances between data
100 //objects
101 float2DVector m_proximityMatrix;
102
103 //data structure that stores current (intermediate) state of the
104 //clusters
105 int2DVector m_intermediateCG;
106
107 //vector mapping the data object id and path to the data (unipen) file
108 stringVector m_hyperlinksVec;
109
110 //contains the number of clusters required
111 int m_numOfClusters;
112
113 //output file handle to write the cluster results as html
114 //with name of the file as OUTPUT_HTML_FILE_NAME
115 ofstream m_output;
116
117 //flag to indicate whether the output is required as html
118 bool m_writeHTML;
119
120 //flag to indicate whether to show all levels of the hierarchy in the
121 //html file
122 bool m_showAllLevels;
123
124 //vector to hold merging distance for each number of clusters
125 floatVector m_mergingDist;
126
127 //flag for determining number of clusters
128 bool m_determineClusters;
129
130 //output result directory
131 string m_outputDir;
132
133 //extension of the image corresponding to each data object in
134 //order to write in the html
135 string m_imageFileExtn;
136
137 //Method for defining inter-cluster distance
138 ELTKHCMethod m_method;
139
140 //number of clusters determined by Average Silhouette method
141 int m_numBySil;
142
143 //cached clustering result corresponding minimum average silhouette
144 int2DVector m_cachedResult;
145
146 //stopping criterion selected - LMethod or Average Silhouette
147 ELTKHCStoppingCriterion m_stoppingCriterion;
148
149 //pointer to the class that has the definition of distance function
150 DistanceClass* m_distClassPtr;
151
152 //function pointer type of the function that defines inter-object distance
153 typedef int (DistanceClass::*FN_PTR_DISTANCE)(const ClusterObjType&,
154 const ClusterObjType&,
155 float&) ;
156
157 //distance function pointer
158 FN_PTR_DISTANCE m_distancePtr;
159
160
161 public:
162
163 /**********************************************************************************
164 * AUTHOR : Bharath A
165 * DATE : 22-FEB-2005
166 * NAME : LTKHierarchicalClustering
167 * DESCRIPTION : Constructor to initialise the required parameters
168 * ARGUMENTS : clusterObjects - vector of data objects which could be LKTrace or
169 * LTKTraceGroup for HWR
170 * noOfClusters - Number of clusters required
171 * clusteringMethod - One of the 3 methods:
172 * SINGLE_LINKAGE,AVERAGE_LINKAGE
173 * or COMPLETE_LINKAGE
174 *
175 * RETURNS :
176 * NOTES :
177 * CHANGE HISTROY
178 * Author Date Description of change
179 *************************************************************************************/
180
181
182 LTKHierarchicalClustering(const vector<ClusterObjType>& clusterObjects,int noOfClusters,
183 ELTKHCMethod clusteringMethod=AVERAGE_LINKAGE) :
m_data(clusterObjects)184 m_data(clusterObjects),m_method(clusteringMethod),
185 m_numOfClusters(noOfClusters),m_writeHTML(false),
186 m_showAllLevels(false),
187 m_determineClusters(false)
188 {
189
190 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
191 <<"LTKHierarchicalClustering::LTKHierarchicalClustering"
192 <<"(vector<ClusterObjType>,int,ELTKHCMethod)"<<endl;
193
194 if(m_numOfClusters < 1 || m_numOfClusters>=clusterObjects.size())
195 {
196 LOG(LTKLogger::LTK_LOGLEVEL_ERR)
197 <<"Number of clusters:"<<m_numOfClusters<<endl;
198
199 LOG(LTKLogger::LTK_LOGLEVEL_ERR)
200 <<"Error : "<< EINVALID_NUM_CLUSTERS <<":"<< getErrorMessage(EINVALID_NUM_CLUSTERS)
201 <<" LTKHierarchicalClustering::"
202 <<"LTKHierarchicalClustering(vector<ClusterObjType>,int,ELTKHCMethod)"<<endl;
203
204 throw LTKException(EINVALID_NUM_CLUSTERS);
205 }
206
207 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
208 <<"LTKHierarchicalClustering::LTKHierarchicalClustering"
209 <<"(vector<ClusterObjType>,int,ELTKHCMethod)"<<endl;
210
211 }
212
213
214 /**********************************************************************************
215 * AUTHOR : Bharath A
216 * DATE : 22-FEB-2005
217 * NAME : LTKHierarchicalClustering
218 * DESCRIPTION : Constructor to initialise the required parameters.
219 * Number of clusters is determined by L-method
220 * ARGUMENTS : clusterObjects - vector of data objects which could be LKTrace or
221 * LTKTraceGroup for HWR
222 * clusteringMethod - One of the 3 methods:
223 * SINGLE_LINKAGE,AVERAGE_LINKAGE
224 * or COMPLETE_LINKAGE
225 * stoppingCriterion - stopping criterion to determine the
226 * right set of clusters: LMETHOD or AVG_SIL
227 *
228 * RETURNS :
229 * NOTES :
230 * CHANGE HISTROY
231 * Author Date Description of change
232 *************************************************************************************/
233
234
235 LTKHierarchicalClustering(const vector<ClusterObjType>& clusterObjects,
236 ELTKHCMethod clusteringMethod=AVERAGE_LINKAGE,
237 ELTKHCStoppingCriterion stoppingCriterion=LMETHOD) :
m_data(clusterObjects)238 m_data(clusterObjects),
239 m_method(clusteringMethod),
240 m_stoppingCriterion(stoppingCriterion),
241 m_writeHTML(false),
242 m_showAllLevels(false),
243 m_determineClusters(true)
244 {
245
246 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
247 <<"LTKHierarchicalClustering::LTKHierarchicalClustering"
248 <<"(vector<ClusterObjType>,ELTKHCMethod,ELTKHCStoppingCriterion)"<<endl;
249
250 if(clusterObjects.size()==0)
251 {
252 LOG(LTKLogger::LTK_LOGLEVEL_ERR)
253 <<"Number of elements in clusterObjects vector is zero"<<endl;
254
255 LOG(LTKLogger::LTK_LOGLEVEL_ERR)
256 <<"Error : "<< ENO_DATA_TO_CLUSTER <<":"<< getErrorMessage(ENO_DATA_TO_CLUSTER)
257 <<" LTKHierarchicalClustering::LTKHierarchicalClustering"
258 <<"(vector<ClusterObjType>,ELTKHCMethod,ELTKHCStoppingCriterion)"<<endl;
259
260
261 throw LTKException(ENO_DATA_TO_CLUSTER);
262 }
263
264 if(clusterObjects.size() < 6 && stoppingCriterion == LMETHOD)
265 {
266 LOG(LTKLogger::LTK_LOGLEVEL_ERR)
267 <<"Number of elements in clusterObjects vector is:"
268 <<clusterObjects.size()<<endl;
269
270 LOG(LTKLogger::LTK_LOGLEVEL_ERR)
271 <<"Error : "<< EINSUFFICIENT_DATA_FOR_LMETHOD
272 <<":"<< getErrorMessage(EINSUFFICIENT_DATA_FOR_LMETHOD)
273 <<" LTKHierarchicalClustering::LTKHierarchicalClustering"
274 <<"(vector<ClusterObjType>,ELTKHCMethod,ELTKHCStoppingCriterion)"<<endl;
275
276 throw LTKException(EINSUFFICIENT_DATA_FOR_LMETHOD);
277 }
278
279 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
280 <<"LTKHierarchicalClustering::LTKHierarchicalClustering"
281 <<"(vector<ClusterObjType>,ELTKHCMethod,ELTKHCStoppingCriterion)"<<endl;
282
283 }
284
285
286
287 /**********************************************************************************
288 * AUTHOR : Bharath A
289 * DATE : 22-FEB-2005
290 * NAME : cluster
291 * DESCRIPTION : Clusters the input data objects. The number of clusters is determined
292 * based on the stopping criterion or as specified by the user.
293 * ARGUMENTS : distanceClassPtr - pointer to the class that contains the distance
294 * function defintion
295 * distFuncPtr - distance function pointer
296 * RETURNS : error code
297 * NOTES :
298 * CHANGE HISTROY
299 * Author Date Description of change
300 *************************************************************************************/
cluster(DistanceClass * distanceClassPtr,FN_PTR_DISTANCE distFuncPtr)301 int cluster(DistanceClass* distanceClassPtr,FN_PTR_DISTANCE distFuncPtr)
302 {
303
304 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
305 <<"LTKHierarchicalClustering::cluster()"<<endl;
306
307
308 m_distancePtr=distFuncPtr;
309
310 m_distClassPtr=distanceClassPtr;
311
312
313 //To compute inter-object distances
314 int errorCode = computeDistances();
315
316 if (errorCode != SUCCESS )
317 {
318 LOG(LTKLogger::LTK_LOGLEVEL_ERR)
319 <<"Error: LTKHierarchicalClustering::cluster()"<<endl;
320
321 LTKReturnError(errorCode)
322 }
323
324 //if the user has specified the number of clusters
325 if(!m_determineClusters)
326 {
327 clusterToFindNumClusters();
328 }
329 else
330 {
331 m_numOfClusters=1;
332
333 //clustering to determine the number of
334 //clusters vs merging distance curve
335 clusterToFindNumClusters();
336
337 m_determineClusters=false;
338
339 if(m_stoppingCriterion==LMETHOD)
340 {
341 //Number of clusters determined by L-method
342 m_numOfClusters=determineNumOfClusters();
343
344 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)
345 <<"Number of clusters determined using L-Method"
346 <<m_numOfClusters<<endl;
347
348 }
349 else if(m_stoppingCriterion==AVG_SIL)
350 {
351
352 //Number of clusters determined by silhouette method
353 m_numOfClusters=m_numBySil;
354
355 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)
356 <<"Number of clusters determined using Average Silhouette method"
357 <<m_numOfClusters<<endl;
358
359 }
360
361 //clearing intermediate clusters formed during evaluation
362 //of the stopping criterion
363 m_intermediateCG.clear();
364
365 //clustering to the number of clusters determined
366 //by the stopping criterion
367 clusterToFindNumClusters();
368
369 }
370
371
372 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
373 <<"LTKHierarchicalClustering::cluster()"<<endl;
374 return SUCCESS;
375
376 }
377
378
379
380 /**********************************************************************************
381 * AUTHOR : Dinesh M
382 * DATE : 23-Jan-2006
383 * NAME : getProximityMatrix
384 * DESCRIPTION : returns the distance matrix
385 * ARGUMENTS :
386 * RETURNS : proximity matrix (float2DVector)
387 * NOTES :
388 * CHANGE HISTROY
389 * Author Date Description of change
390 *************************************************************************************/
391
getProximityMatrix()392 const float2DVector& getProximityMatrix() const
393 {
394
395 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
396 <<"LTKHierarchicalClustering::getProximityMatrix()"<<endl;
397
398
399 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
400 <<"LTKHierarchicalClustering::getProximityMatrix()"<<endl;
401
402 return m_proximityMatrix;
403 }
404
405
406 /**********************************************************************************
407 * AUTHOR : Bharath A
408 * DATE : 22-FEB-2005
409 * NAME : setOutputConfig
410 * DESCRIPTION : This function sets the configuration for the output of clustering.
411 * ARGUMENTS : outputDirectory - path to the directory where output html
412 * is to be generated
413 * displayAllLevels - flag to indicate whether all levels in the
414 * clustering need to written to the file
415 * imageFileExtension - extension of the image file (ex)"png".
416 * If not specified <img> tag in the output html is not created.
417 * RETURNS : error code
418 * NOTES :
419 * CHANGE HISTROY
420 * Author Date Description of change
421 *************************************************************************************/
422 //int setOutputConfig(const string& outputDirectory,
423 // bool displayAllLevels=false,
424 // string imageFileExtension="")
425 int setOutputConfig(const string& outputDirectory,bool displayAllLevels=false,
426 const string& imageFileExtension="")
427 {
428
429 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
430 <<"LTKHierarchicalClustering::setOutputConfig()"<<endl;
431
432 m_writeHTML=true;
433
434 m_showAllLevels=displayAllLevels;
435
436 string tempOutputFilePath=outputDirectory+"/"+OUTPUT_HTML_FILE_NAME;
437
438 m_output.open(tempOutputFilePath.c_str());
439
440 if(m_output.fail())
441 {
442
443 LOG(LTKLogger::LTK_LOGLEVEL_ERR)
444 <<"Unable to create file:"
445 <<tempOutputFilePath<<endl;
446
447 LOG(LTKLogger::LTK_LOGLEVEL_ERR)
448 <<"Error : "<< EFILE_CREATION_FAILED <<":"
449 <<getErrorMessage(EFILE_CREATION_FAILED)
450 <<" LTKHierarchicalClustering::setOutputConfig()" <<endl;
451
452 LTKReturnError(EFILE_CREATION_FAILED)
453 }
454
455 m_outputDir=outputDirectory;
456
457 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)
458 <<"Clustering results output directory:"
459 <<m_outputDir<<endl;
460
461 m_output.close();
462
463 //If it takes the default value, <img> tag in the output is not generated
464 m_imageFileExtn=imageFileExtension;
465
466 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)
467 <<"Image file extension:"<<m_imageFileExtn<<endl;
468
469 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
470 <<"LTKHierarchicalClustering::setOutputConfig()"<<endl;
471
472 return SUCCESS;
473 }
474
475
476 /**********************************************************************************
477 * AUTHOR : Bharath A
478 * DATE : 22-FEB-2005
479 * NAME : setHyperlinkMap
480 * DESCRIPTION : To set hyperlinks for each data object which refers to actual data file.
481 * Assumes one-to-one correspondence with the data vector
482 * passed in the constructor.
483 * ARGUMENTS : hyperlinksVector - Vector containing paths to physical
484 * files of each data object.
485 *
486 * RETURNS : error code
487 * NOTES :
488 * CHANGE HISTROY
489 * Author Date Description of change
490 *************************************************************************************/
491 //int setHyperlinkMap(const vector<string>& hyperlinksVector)
setHyperlinkMap(const vector<string> & hyperlinksVector)492 int setHyperlinkMap(const vector<string>& hyperlinksVector)
493 {
494
495 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
496 <<"LTKHierarchicalClustering::setHyperlinkMap()"<<endl;
497
498 if(m_data.size()!=hyperlinksVector.size())
499 {
500 LOG(LTKLogger::LTK_LOGLEVEL_ERR)
501 <<"cluster objects vector size:"<<m_data.size()
502 <<" and hyperlinks vector size:"<<hyperlinksVector.size()<<endl;
503
504 LOG(LTKLogger::LTK_LOGLEVEL_ERR)
505 <<"Error : "<< EDATA_HYPERLINK_VEC_SIZE_MISMATCH
506 <<":"<< getErrorMessage(EDATA_HYPERLINK_VEC_SIZE_MISMATCH)
507 <<" LTKHierarchicalClustering::setHyperlinkMap()" <<endl;
508
509 LTKReturnError(EDATA_HYPERLINK_VEC_SIZE_MISMATCH);
510 }
511
512 m_hyperlinksVec=hyperlinksVector; //Vector for hyperlinks is set
513
514
515 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
516 <<"LTKHierarchicalClustering::setHyperlinkMap()"<<endl;
517
518 return SUCCESS;
519
520 }
521
522
523 /**********************************************************************************
524 * AUTHOR : Bharath A
525 * DATE : 22-FEB-2005
526 * NAME : getClusterResult
527 * DESCRIPTION : Populates the argument (vector of vectors) with data objects indices.
528 * Each row (inner vector) corresponds to a cluster.
529 * ARGUMENTS : outClusterResult - reference to result vector of vectors.
530 *
531 * RETURNS :
532 * NOTES :
533 * CHANGE HISTROY
534 * Author Date Description of change
535 *************************************************************************************/
getClusterResult(vector<vector<int>> & outClusterResult)536 void getClusterResult(vector<vector<int> >& outClusterResult) const
537 {
538 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
539 <<"LTKHierarchicalClustering::getClusterResult()"<<endl;
540
541 for(int v=0;v<m_intermediateCG.size();v++)
542 {
543
544 outClusterResult.push_back(m_intermediateCG[v]);
545 }
546
547 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
548 <<"LTKHierarchicalClustering::getClusterResult()"<<endl;
549 }
550
551 /**********************************************************************************
552 * AUTHOR : Bharath A
553 * DATE : 22-FEB-2005
554 * NAME : computeProximityMatrix
555 * DESCRIPTION : Populates the argument (vector of vectors) with data objects indices.
556 * Each inner vector corresponds to a cluster.
557 * ARGUMENTS : distanceClassPtr - pointer to the class that has the distance definition
558 * distFuncPtr - function pointer to the distance function
559 * RETURNS : error code
560 * NOTES :
561 * CHANGE HISTROY
562 * Author Date Description of change
563 *************************************************************************************/
564
565 //void computeProximityMatrix(ComputeDistanceFunc computeDistFuncObj)
computeProximityMatrix(DistanceClass * distanceClassPtr,FN_PTR_DISTANCE distFuncPtr)566 int computeProximityMatrix(DistanceClass* distanceClassPtr,
567 FN_PTR_DISTANCE distFuncPtr)
568 {
569 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
570 <<"LTKHierarchicalClustering::computeProximityMatrix()"<<endl;
571
572 m_distancePtr=distFuncPtr;
573 m_distClassPtr=distanceClassPtr;
574
575 int errorCode;
576
577 if((errorCode=computeDistances())!=SUCCESS)
578 {
579 LOG(LTKLogger::LTK_LOGLEVEL_ERR)
580 <<"Error: LTKHierarchicalClustering::computeProximityMatrix()"<<endl;
581
582 LTKReturnError(errorCode);
583 }
584
585 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
586 <<"LTKHierarchicalClustering::computeProximityMatrix()"<<endl;
587
588 return SUCCESS;
589 }
590
591
592
593 private:
594
595 /**********************************************************************************
596 * AUTHOR : Bharath A
597 * DATE : 22-FEB-2005
598 * NAME : computeDistances
599 * DESCRIPTION : Computes inter-object distances and puts them in the distance matrix
600 * ARGUMENTS :
601 * RETURNS : error code
602 * NOTES :
603 * CHANGE HISTROY
604 * Author Date Description of change
605 *************************************************************************************/
606
computeDistances()607 int computeDistances()
608 {
609
610 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
611 <<"LTKHierarchicalClustering::computeDistances()"<<endl;
612
613 for(int i=0;i<(m_data.size()-1);++i)
614 {
615 vector<float> eachRow((m_data.size()-i)-1);//added -1 at the end
616
617 int c=0;
618
619 for(int j=i+1;j<m_data.size();++j)
620 {
621 //external distance function called
622 int errorCode = (m_distClassPtr->*m_distancePtr)(m_data[i],m_data[j], eachRow[c]);
623
624 if (errorCode != SUCCESS )
625 {
626 LOG(LTKLogger::LTK_LOGLEVEL_ERR)
627 <<"Error while calling distance function"<<endl;
628
629 LOG(LTKLogger::LTK_LOGLEVEL_ERR)
630 <<"Error: LTKHierarchicalClustering::computeDistances()"<<endl;
631
632 LTKReturnError(errorCode);
633 }
634
635 ++c;
636
637 }
638
639 m_proximityMatrix.push_back(eachRow);
640
641 }
642
643 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
644 <<"LTKHierarchicalClustering::computeDistances()"<<endl;
645
646 return SUCCESS;
647
648 }
649
650
651
652 /**********************************************************************************
653 * AUTHOR : Bharath A
654 * DATE : 22-FEB-2005
655 * NAME : clusterToFindNumClusters
656 * DESCRIPTION : Clusters the data objects hierarchically (agglomerative)
657 * till the desired number of
658 * clusters and also evaluates the stopping criterion selected.
659 * ARGUMENTS :
660 * RETURNS : error code
661 * NOTES :
662 * CHANGE HISTROY
663 * Author Date Description of change
664 *************************************************************************************/
665
clusterToFindNumClusters()666 int clusterToFindNumClusters()
667 {
668
669 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
670 <<"LTKHierarchicalClustering::clusterToFindNumClusters()"<<endl;
671
672 if(m_stoppingCriterion==LMETHOD)
673 {
674 //Number of clusters needs to be determined
675 if(m_determineClusters)
676 {
677 //map for number of clusters vs. merging distance
678 m_mergingDist.reserve(m_data.size());
679 }
680 }
681 else if(m_stoppingCriterion==AVG_SIL)
682 {
683 if(m_writeHTML==false && m_cachedResult.size()>0)
684 {
685 m_intermediateCG=m_cachedResult;
686
687 return SUCCESS;
688 }
689 }
690
691 for(int i=0;i<m_data.size();i++)
692 {
693 vector<int> v;
694 v.push_back(i);
695
696 //To begin with, each data object is cluster by itself
697 m_intermediateCG.push_back(v);
698 }
699
700 if(m_writeHTML) //If output is needed as html
701 {
702 string outputFilePath=m_outputDir+"/"+OUTPUT_HTML_FILE_NAME;
703
704 m_output.open(outputFilePath.c_str()); //Cluster output file is created
705
706 if(m_output.fail())
707 {
708 LOG(LTKLogger::LTK_LOGLEVEL_ERR)
709 <<"Unable to create file:"
710 <<outputFilePath<<endl;
711
712 LOG(LTKLogger::LTK_LOGLEVEL_ERR)
713 <<"Error : "<< EFILE_CREATION_FAILED
714 <<":"<< getErrorMessage(EFILE_CREATION_FAILED)
715 <<" LTKHierarchicalClustering::clusterToFindNumClusters()" <<endl;
716
717 LTKReturnError(EFILE_CREATION_FAILED);
718 }
719
720 /*Html tags are written*/
721 m_output<<"<html>\n";
722 m_output<<"<body>\n";
723 m_output<<"<table border='1' bordercolor='black'>\n";
724
725
726 m_output<<"<tr>\n";
727
728
729 for(int v=0;v<m_intermediateCG.size();v++)
730 {
731
732 int clusterSize=m_intermediateCG[v].size();
733
734 m_output<<"<td colspan=\""<<clusterSize<<"\">";
735
736 for(int w=0;w<clusterSize;w++)
737 {
738 if(m_hyperlinksVec.size()>0)
739 {
740 m_output<<"<a href='"
741 <<m_hyperlinksVec[m_intermediateCG[v][w]]
742 <<"'>"<<m_intermediateCG[v][w]<<"</a> ";
743 }
744 else
745 {
746 m_output<<m_intermediateCG[v][w]<<" ";
747 }
748
749 //if there is an image file corresponding to each data object
750 if(!m_imageFileExtn.empty())
751 {
752 m_output<<"<img src=\""
753 <<m_intermediateCG[v][w]<<"."<<m_imageFileExtn
754 <<"\" border=\"0\"/> ";
755 }
756
757
758 }
759 }
760
761 m_output<<"<td><b>";
762 m_output<<"Inter-cluster Dist";
763 m_output<<"</b></td>";
764
765 m_output<<"</tr>\n";
766
767 }
768
769 if(m_numOfClusters<m_data.size() || m_determineClusters==true)
770 {
771
772 int currNumOfClusters = m_data.size();
773
774 //this local variable is used only for Average Silhouette method
775 float minSil=FLT_MAX;
776
777
778 for(int it=0;it<(m_data.size()-m_numOfClusters);++it)
779 {
780 vector<int> toCluster;
781
782 //to find the clusters that need to be merged
783 float interClusterDistance=findGroup(toCluster);
784
785 currNumOfClusters=m_data.size()-it-1;
786
787 if(m_stoppingCriterion==AVG_SIL)
788 {
789 float silDiff=computeAvgSil(toCluster[0],toCluster[1]);
790
791 if(silDiff<minSil)
792 {
793 minSil=silDiff;
794 if(currNumOfClusters > 2)
795 {
796 m_numBySil=currNumOfClusters+1;
797 m_cachedResult=m_intermediateCG;
798 }
799
800 }
801 }
802 else if(m_stoppingCriterion==LMETHOD && m_determineClusters==true)
803 {
804 m_mergingDist[currNumOfClusters]=interClusterDistance;
805 }
806
807
808 //clusters are merged
809 m_intermediateCG[toCluster[0]].insert(m_intermediateCG[toCluster[0]].end(),
810 m_intermediateCG[toCluster[1]].begin(),
811 m_intermediateCG[toCluster[1]].end());
812
813 //old cluster deleted
814 m_intermediateCG.erase(m_intermediateCG.begin()+ toCluster[1]);
815
816
817
818 if(m_writeHTML)
819 {
820 if(!m_showAllLevels)
821 {
822 if(currNumOfClusters==m_numOfClusters)
823 {
824 writeClustersAsHTML(interClusterDistance);
825 }
826 }
827 else
828 {
829 writeClustersAsHTML(interClusterDistance);
830 }
831
832 }
833
834
835 }
836
837
838 }
839
840
841 //closing the html tags
842 if(m_writeHTML)
843 {
844 m_output<<"</table>\n";
845 m_output<<"</body>\n";
846 m_output<<"</html>";
847
848 m_output.close();
849 }
850
851 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
852 <<"LTKHierarchicalClustering::clusterToFindNumClusters()"<<endl;
853
854 return SUCCESS;
855
856 }
857
858
859
860
861
862
863
864 /**********************************************************************************
865 * AUTHOR : Bharath A
866 * DATE : 22-FEB-2005
867 * NAME : getInterObjectDistance
868 * DESCRIPTION : Returns the distance between two data objects from the distance matrix
869 * ARGUMENTS : firstObjIndex - index of the first data object
870 * secondObjIndex - index of the second data object
871 * RETURNS : distance (float)
872 * NOTES :
873 * CHANGE HISTROY
874 * Author Date Description of change
875 *************************************************************************************/
getInterObjectDistance(int firstObjIndex,int secondObjIndex)876 float getInterObjectDistance(int firstObjIndex,int secondObjIndex) const
877 {
878 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
879 <<"LTKHierarchicalClustering::getInterObjectDistance()"<<endl;
880
881 int row = 0;
882 int col = 0;
883
884 if(firstObjIndex < secondObjIndex)
885 {
886 row=firstObjIndex;
887 col=secondObjIndex;
888 }
889 else
890 {
891 row=secondObjIndex;
892 col=firstObjIndex;
893 } //lesser index is made as row and the other as column
894
895 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
896 <<"LTKHierarchicalClustering::getInterObjectDistance()"<<endl;
897
898 return m_proximityMatrix[row][col-row-1];
899 }
900
901 /**********************************************************************************
902 * AUTHOR : Bharath A
903 * DATE : 22-FEB-2005
904 * NAME : findGroup
905 * DESCRIPTION : Finds the indices in the m_intermediateCG (clusters) that
906 * need to be merged
907 * ARGUMENTS : pairToCombine - vector for storing the cluster indices
908 * RETURNS : inter cluster distance (float)
909 * NOTES :
910 * CHANGE HISTROY
911 * Author Date Description of change
912 *************************************************************************************/
913 //double findGroup(vector<int> pairToCombine)
findGroup(vector<int> & pairToCombine)914 float findGroup(vector<int>& pairToCombine) const
915 {
916 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
917 <<"LTKHierarchicalClustering::findGroup()"<<endl;
918
919 float minDistance=FLT_MAX;
920
921 pairToCombine.clear();
922
923 pairToCombine.resize(2);
924
925 for(int i=0;i<m_intermediateCG.size();i++)
926 {
927 for(int j=i+1;j<m_intermediateCG.size();j++)
928 {
929 float tempDist=findInterClusterDistance(m_intermediateCG[i],
930 m_intermediateCG[j]);
931
932 if(tempDist<minDistance)
933 {
934
935 minDistance=tempDist;
936 pairToCombine[0]=i;
937 pairToCombine[1]=j;
938 }
939 }
940
941 }
942
943 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
944 <<"LTKHierarchicalClustering::findGroup()"<<endl;
945
946
947 return minDistance;
948
949 }
950
951
952 /**********************************************************************************
953 * AUTHOR : Bharath A
954 * DATE : 22-FEB-2005
955 * NAME : findInterClusterDistance
956 * DESCRIPTION : Finds the inter-cluster distance.
957 * The contents of each cluster are in the vectors.
958 * ARGUMENTS : v1 cluster one
959 * v2 cluster two
960 * RETURNS : inter cluster distance
961 * NOTES :
962 * CHANGE HISTROY
963 * Author Date Description of change
964 *************************************************************************************/
965
966 //double findInterClusterDistance(const vector<int>& v1,const vector<int>& v2)
findInterClusterDistance(const vector<int> & cluster1,const vector<int> & cluster2)967 float findInterClusterDistance(const vector<int>& cluster1, const vector<int>& cluster2) const
968 {
969 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
970 <<"LTKHierarchicalClustering::findInterClusterDistance()"<<endl;
971
972 float groupDistance=0.0;
973
974 /*For single-linkage algorithm*/
975 if(m_method==SINGLE_LINKAGE)
976 {
977 groupDistance= FLT_MAX;
978
979 for(int i=0;i<cluster1.size();i++)
980 {
981 for(int j=0;j<cluster2.size();j++)
982 {
983 float temp=getInterObjectDistance(cluster1[i],cluster2[j]);
984
985 if(temp < groupDistance)
986 {
987 groupDistance=temp;
988 }
989 }
990 }
991 }
992
993
994 /*For average-linkage algorithm*/
995 if(m_method==AVERAGE_LINKAGE)
996 {
997 groupDistance=0.0;
998
999 for(int i=0;i<cluster1.size();i++)
1000 {
1001 for(int j=0;j<cluster2.size();j++)
1002 {
1003
1004 groupDistance+=getInterObjectDistance(cluster1[i],cluster2[j]);
1005 }
1006 }
1007
1008 groupDistance/=((float)(cluster1.size()*cluster2.size()));
1009 }
1010
1011
1012 /*For complete-linkage algorithm*/
1013 if(m_method==COMPLETE_LINKAGE)
1014 {
1015 groupDistance=0.0;
1016
1017 for(int i=0;i<cluster1.size();i++)
1018 {
1019 for(int j=0;j<cluster2.size();j++)
1020 {
1021 float temp=getInterObjectDistance(cluster1[i],cluster2[j]);
1022
1023 if(temp > groupDistance)
1024 {
1025 groupDistance=temp;
1026 }
1027 }
1028 }
1029 }
1030
1031 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
1032 <<"LTKHierarchicalClustering::findInterClusterDistance()"<<endl;
1033
1034
1035 return groupDistance;
1036
1037
1038 }
1039
1040 /**********************************************************************************
1041 * AUTHOR : Bharath A
1042 * DATE : 22-FEB-2005
1043 * NAME : writeClustersAsHTML
1044 * DESCRIPTION : Writes the cluster results as html with data objects' and clusters' ids.
1045 * If hyperlinks vector is set, provides links to actual files.
1046 * ARGUMENTS : interClustDist - merging distance of the new cluster formed
1047 * RETURNS :
1048 * NOTES :
1049 * CHANGE HISTROY
1050 * Author Date Description of change
1051 *************************************************************************************/
1052
writeClustersAsHTML(float interClustDist)1053 void writeClustersAsHTML(float interClustDist)
1054 {
1055
1056 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
1057 <<"LTKHierarchicalClustering::writeClustersAsHTML()"<<endl;
1058
1059 m_output<<"<tr>\n";
1060
1061 for(int v=0;v<m_intermediateCG.size();v++)
1062 {
1063 int clusterSize=m_intermediateCG[v].size();
1064
1065 m_output<<"<td colspan=\""<<clusterSize<<"\">";
1066
1067 m_output<<"("<<v<<")<br>";
1068
1069 for(int w=0;w<clusterSize;w++)
1070 {
1071 if(m_hyperlinksVec.size()>0)
1072 {
1073 m_output<<"<a href='"<<m_hyperlinksVec[m_intermediateCG[v][w]]
1074 <<"'>"<<m_intermediateCG[v][w]<<"</a> ";
1075 }
1076 else
1077 {
1078 m_output<<m_intermediateCG[v][w]<<" ";
1079 }
1080 if(!m_imageFileExtn.empty())
1081 {
1082 m_output<<"<img src=\""<<m_intermediateCG[v][w]<<"."
1083 <<m_imageFileExtn
1084 <<"\" border=\"0\"/> ";
1085 }
1086
1087 }
1088 }
1089
1090 m_output<<"<td>";
1091
1092 m_output<<"("<<m_intermediateCG.size()
1093 <<") <b>"<<interClustDist<<"</b>";
1094
1095 m_output<<"</td>";
1096
1097 m_output<<"</tr>\n";
1098
1099 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
1100 <<"LTKHierarchicalClustering::writeClustersAsHTML()"<<endl;
1101
1102 }
1103
1104 /**********************************************************************************
1105 * AUTHOR : Bharath A
1106 * DATE : 22-FEB-2005
1107 * NAME : determineNumOfClusters
1108 * DESCRIPTION : Determines the number of clusters to be formed using
1109 * iterative refinement of L-method.
1110 * REFERENCE:S. Salvador and P. Chan. Determining the number of clusters/
1111 * segments in hierarchical clustering/segmentation algorithms.
1112 * Proceedings of 16th IEEE International Conference
1113 * on Tools with Artificial Intelligence, 3:1852-1857, 2004.
1114 * ARGUMENTS :
1115 * RETURNS : number of clusters
1116 * NOTES :
1117 * CHANGE HISTROY
1118 * Author Date Description of change
1119 *************************************************************************************/
1120
determineNumOfClusters()1121 int determineNumOfClusters() const
1122 {
1123 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
1124 <<"LTKHierarchicalClustering::determineNumOfClusters()"<<endl;
1125
1126 int cutOff=(int)m_mergingDist.capacity()-1;
1127
1128 int lastKnee=0;
1129 int currentKnee=0;
1130
1131 lastKnee=cutOff;
1132
1133 currentKnee=lastKnee;
1134
1135 bool trueCutOff=false;
1136
1137 /*Iterative refinement of the L-method result*/
1138
1139 while(true)
1140 {
1141
1142 lastKnee=currentKnee;
1143
1144 currentKnee=determineKnee(cutOff);
1145
1146 if(trueCutOff)
1147 {
1148
1149 if(currentKnee >= lastKnee)
1150 {
1151
1152 break;
1153 }
1154 }
1155
1156 int temp=currentKnee*2;
1157
1158 if(temp>cutOff)
1159 {
1160 --cutOff;
1161
1162 trueCutOff=false;
1163 }
1164 else
1165 {
1166 cutOff=temp;
1167
1168 trueCutOff=true;
1169 }
1170
1171
1172 if(cutOff < MIN_CUTOFF)
1173 {
1174
1175 break;
1176 }
1177
1178
1179 }
1180
1181 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)
1182 <<"Number of clusters determined by iterative refinement:"
1183 <<currentKnee<<endl;
1184
1185 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
1186 <<"LTKHierarchicalClustering::determineNumOfClusters()"<<endl;
1187
1188 return currentKnee;
1189 }
1190
1191
1192
1193
1194 /**********************************************************************************
1195 * AUTHOR : Bharath A
1196 * DATE : 22-FEB-2005
1197 * NAME : determineKnee
1198 * DESCRIPTION : Determines the knee of the given number of clusters vs. merging distance curve
1199 * ARGUMENTS : maxNumClusters - maximum number of clusters
1200 * RETURNS : knee point of the curve
1201 * NOTES :
1202 * CHANGE HISTROY
1203 * Author Date Description of change
1204 *************************************************************************************/
determineKnee(int maxNumClusters)1205 int determineKnee(int maxNumClusters) const
1206 {
1207 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
1208 <<"LTKHierarchicalClustering::determineKnee()"<<endl;
1209
1210 int result=0;
1211
1212 float minRMSE = FLT_MAX;
1213
1214 //atleast two points are required to fit lines, hence c ranges
1215 //from 3 to maxNumClusters-2
1216 for(int c=3;c<maxNumClusters-2;c++)
1217 {
1218
1219 float lRMSE=0,rRMSE=0;
1220
1221 findRMSE(c,maxNumClusters,lRMSE,rRMSE);
1222
1223 float cRMSE=(((float)(c-1)/(float)(maxNumClusters-1))*lRMSE)
1224 +(((float)(maxNumClusters-c)/(float)(maxNumClusters-1))*rRMSE);
1225
1226 if(cRMSE<minRMSE)
1227 {
1228
1229 minRMSE=cRMSE;
1230
1231 result=c;
1232 }
1233 }
1234
1235 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
1236 <<"LTKHierarchicalClustering::determineKnee()"<<endl;
1237
1238 return result+1;
1239 }
1240
1241
1242
1243
1244 /**********************************************************************************
1245 * AUTHOR : Bharath A
1246 * DATE : 22-FEB-2005
1247 * NAME : determineKnee
1248 * DESCRIPTION : Determines left and right RMSE values of the given point
1249 * on the no. of clusters vs. merging distance curve.
1250 * It fits two regression lines on either side of the point to find RMSEs
1251 * ARGUMENTS : candidateKnee - candidata knee point
1252 * maxNumClusters - upper bound on number of clusters
1253 * lRMSE - output RMSE on the left of c
1254 * rRMSE - output RMSE on the right of c
1255 * RETURNS :
1256 * NOTES :
1257 * CHANGE HISTROY
1258 * Author Date Description of change
1259 *************************************************************************************/
findRMSE(int candidateKnee,int maxNumClusters,float & outLRMSE,float & outRRMSE)1260 void findRMSE(int candidateKnee,int maxNumClusters,float& outLRMSE,float& outRRMSE) const
1261 {
1262 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
1263 <<"LTKHierarchicalClustering::findRMSE()"<<endl;
1264
1265 float avgXLeft = 0;
1266 float avgYLeft = 0;
1267 float avgXRight = 0;
1268 float avgYRight = 0;
1269
1270 /*Regression line coefficients*/
1271 float beta0Left = 0;
1272 float beta1Left = 0;
1273 float beta0Right = 0;
1274 float beta1Right = 0;
1275
1276 int i = 0;
1277 int j = 0;
1278
1279 for(i=2;i<=candidateKnee;i++)
1280 {
1281 avgYLeft+=m_mergingDist[i];
1282
1283 avgXLeft+=i;
1284 }
1285
1286
1287 avgYLeft /= (candidateKnee-1);
1288 avgXLeft /= (candidateKnee-1);
1289
1290
1291 for(j=candidateKnee+1;j<=maxNumClusters;j++)
1292 {
1293
1294 avgYRight += m_mergingDist[j];
1295 avgXRight += j;
1296 }
1297
1298 avgYRight /= (maxNumClusters-candidateKnee);
1299 avgXRight /= (maxNumClusters-candidateKnee);
1300
1301 float numer=0;
1302 float denom=0;
1303
1304 for(i=2;i<=candidateKnee;i++)
1305 {
1306
1307 numer += ((i-avgXLeft)*(m_mergingDist[i]-avgYLeft));
1308 denom += ((i-avgXLeft)*(i-avgXLeft));
1309 }
1310
1311 beta1Left = numer/denom;
1312 beta0Left = avgYLeft-(beta1Left*avgXLeft);
1313
1314 numer=0;denom=0;
1315
1316 for(j=candidateKnee+1;j<=maxNumClusters;j++)
1317 {
1318 numer += ((j-avgXRight)*(m_mergingDist[j]-avgYRight));
1319 denom += ((j-avgXRight)*(j-avgXRight));
1320 }
1321
1322 if(denom > EPS)
1323 {
1324 beta1Right = numer / denom;
1325 }
1326 else
1327 {
1328 beta1Right = 0;
1329 }
1330
1331 beta0Right = avgYRight-(beta1Right*avgXRight);
1332
1333 float errorSOS=0;
1334
1335 for(i=2;i<=candidateKnee;i++)
1336 {
1337
1338 float yCap = (beta0Left+(beta1Left*i));
1339
1340 errorSOS += ((m_mergingDist[i]-yCap)*(m_mergingDist[i]-yCap));
1341
1342 }
1343
1344 outLRMSE=sqrt(errorSOS/(candidateKnee-2));
1345
1346 errorSOS=0;
1347
1348 for(j=candidateKnee+1;j<=maxNumClusters;j++)
1349 {
1350
1351 float yCap = beta0Right + (beta1Right*j);
1352
1353 errorSOS += (m_mergingDist[j]-yCap)*(m_mergingDist[j]-yCap);
1354
1355 }
1356
1357 outRRMSE=sqrt(errorSOS/(maxNumClusters-candidateKnee-1));
1358
1359 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
1360 <<"LTKHierarchicalClustering::findRMSE()"<<endl;
1361
1362 }
1363
1364
1365
1366
1367 /**********************************************************************************
1368 * AUTHOR : Bharath A
1369 * DATE : 22-FEB-2005
1370 * NAME : computeAvgSil
1371 * DESCRIPTION : Funtion that determines the change in the ratio of
1372 * intra and inter cluster similarity before and after merging
1373 * ARGUMENTS : clust1Index - index of cluster one
1374 * clust2Index - index of cluster two
1375 * RETURNS : average silhouette computed
1376 * NOTES :
1377 * CHANGE HISTROY
1378 * Author Date Description of change
1379 *************************************************************************************/
computeAvgSil(int clust1Index,int clust2Index)1380 float computeAvgSil(int clust1Index,int clust2Index) const
1381 {
1382 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
1383 <<"LTKHierarchicalClustering::computeAvgSil()"<<endl;
1384
1385 const vector<int>& clust1=m_intermediateCG[clust1Index];
1386
1387 const vector<int>& clust2=m_intermediateCG[clust2Index];
1388
1389 vector<int> combinedClust;
1390
1391 combinedClust.insert(combinedClust.end(),clust1.begin(),clust1.end());
1392
1393 combinedClust.insert(combinedClust.end(),clust2.begin(),clust2.end());
1394
1395 float clust1TotalSils=0.0f,clust2TotalSils=0.0f,combinedClustTotalSils=0.0f;
1396
1397 for(int i=0;i<clust1.size();i++)
1398 {
1399 int dataObj=clust1[i];
1400
1401 float avgIntraDist=0.0;
1402
1403 if(clust1.size()>1)
1404 {
1405 for(int j=0;j<clust1.size();j++)
1406 {
1407 if(clust1[j]!=dataObj)
1408 {
1409 avgIntraDist+=getInterObjectDistance(dataObj,clust1[j]);
1410
1411 }
1412 }
1413
1414 avgIntraDist/=((float)(clust1.size()-1));
1415 }
1416
1417 float minInterDist= FLT_MAX;
1418
1419 for(int r=0;r<m_intermediateCG.size();r++)
1420 {
1421 float avgInterDist=0.0;
1422
1423 if(r!=clust1Index)
1424 {
1425 for(int c=0;c<m_intermediateCG[r].size();c++)
1426 {
1427 avgInterDist+=getInterObjectDistance(dataObj,m_intermediateCG[r][c]);
1428 }
1429
1430 avgInterDist/=((float)m_intermediateCG[r].size());
1431
1432 if(avgInterDist<minInterDist)
1433 {
1434 minInterDist=avgInterDist;
1435 }
1436 }
1437
1438 }
1439
1440 float dataObjSil=0.0;
1441
1442 if(minInterDist > avgIntraDist && minInterDist > EPS)
1443 {
1444
1445 dataObjSil=(minInterDist-avgIntraDist)/minInterDist;
1446
1447 }
1448 else if(avgIntraDist > EPS)
1449 {
1450
1451 dataObjSil=(minInterDist-avgIntraDist)/avgIntraDist;
1452
1453 }
1454
1455 clust1TotalSils+=dataObjSil;
1456 }
1457
1458 for(int ii=0;ii<clust2.size();ii++)
1459 {
1460 int dataObj=clust2[ii];
1461
1462 float avgIntraDist=0.0;
1463
1464 if(clust2.size()>1)
1465 {
1466 for(int j=0;j<clust2.size();j++)
1467 {
1468 if(clust2[j]!=dataObj)
1469 {
1470
1471 avgIntraDist+=getInterObjectDistance(dataObj,clust2[j]);
1472
1473 }
1474 }
1475
1476 avgIntraDist/=((float)(clust2.size()-1));
1477 }
1478
1479 float minInterDist= FLT_MAX;
1480
1481 for(int r=0;r<m_intermediateCG.size();r++)
1482 {
1483 float avgInterDist=0.0;
1484
1485 if(r!=clust2Index)
1486 {
1487 for(int c=0;c<m_intermediateCG[r].size();c++)
1488 {
1489 avgInterDist+=getInterObjectDistance(dataObj,
1490 m_intermediateCG[r][c]);
1491 }
1492
1493 avgInterDist/=((float)m_intermediateCG[r].size());
1494
1495 if(avgInterDist < minInterDist)
1496 {
1497 minInterDist=avgInterDist;
1498 }
1499 }
1500
1501 }
1502 float dataObjSil=0.0;
1503
1504 if(minInterDist > avgIntraDist && minInterDist > EPS)
1505 {
1506 dataObjSil=(minInterDist-avgIntraDist)/minInterDist;
1507 }
1508 else if(avgIntraDist > EPS)
1509 {
1510 dataObjSil=(minInterDist-avgIntraDist)/avgIntraDist;
1511 }
1512
1513 clust2TotalSils+=dataObjSil;
1514 }
1515
1516
1517 for(int iii=0;iii<combinedClust.size();iii++)
1518 {
1519 int dataObj=combinedClust[iii];
1520
1521 float avgIntraDist=0.0;
1522
1523 if(combinedClust.size()>1)
1524 {
1525 for(int j=0;j<combinedClust.size();j++)
1526 {
1527 if(combinedClust[j]!=dataObj)
1528 {
1529 avgIntraDist+=getInterObjectDistance(dataObj,combinedClust[j]);
1530
1531 }
1532 }
1533 avgIntraDist/=((float)(combinedClust.size()-1));
1534 }
1535
1536 float minInterDist=FLT_MAX;
1537
1538 for(int r=0;r<m_intermediateCG.size();r++)
1539 {
1540
1541 if(r!=clust1Index && r!=clust2Index)
1542 {
1543 float avgInterDist=0.0f;
1544
1545 for(int c=0;c<m_intermediateCG[r].size();c++)
1546 {
1547 avgInterDist+=getInterObjectDistance(dataObj,m_intermediateCG[r][c]);
1548 }
1549
1550 avgInterDist = (float)avgInterDist/ ((float)m_intermediateCG[r].size());
1551
1552 if(avgInterDist<minInterDist)
1553 {
1554 minInterDist=avgInterDist;
1555
1556 }
1557 }
1558
1559 }
1560
1561 float dataObjSil=0.0;
1562
1563 if(minInterDist>avgIntraDist && minInterDist > EPS)
1564 {
1565
1566 dataObjSil=(minInterDist-avgIntraDist)/minInterDist;
1567
1568 }
1569 else if(avgIntraDist > EPS)
1570 {
1571
1572 dataObjSil=(minInterDist-avgIntraDist)/avgIntraDist;
1573
1574 }
1575
1576 combinedClustTotalSils+=dataObjSil;
1577 }
1578
1579
1580 LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
1581 <<"LTKHierarchicalClustering::computeAvgSil()"<<endl;
1582
1583
1584 return (combinedClustTotalSils-clust1TotalSils-clust2TotalSils);
1585
1586 }
1587
1588
1589
1590 };
1591 #endif
1592
1593