1 /*****************************************************************************************
2 * Copyright (c) 2006 Hewlett-Packard Development Company, L.P.
3 * Permission is hereby granted, free of charge, to any person obtaining a copy of
4 * this software and associated documentation files (the "Software"), to deal in
5 * the Software without restriction, including without limitation the rights to use,
6 * copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the
7 * Software, and to permit persons to whom the Software is furnished to do so,
8 * subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
11 *
12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
13 * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
14 * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
15 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
16 * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
17 * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
18 *****************************************************************************************/
19 
20 /************************************************************************
21  * FILE DESCR: Definitions of Agglomerative Hierarchical Clustering module
22  *
23  * CONTENTS:
24  *             cluster
25  *             getProximityMatrix
26  *             setOutputConfig
27  *             setHyperlinkMap
28  *             getClusterResult
29  *             computeProximityMatrix
30  *             computeDistances
31  *             clusterToFindNumClusters
32  *             getInterObjectDistance
33  *             findGroup
34  *             findInterClusterDistance
35  *             writeClustersAsHTML
36  *             determineNumOfClusters
37  *             determineKnee
38  *             findRMSE
39  *             computeAvgSil
40  *
41  *
42  * AUTHOR:     Bharath A
43  *
44  * DATE:       February 22, 2005
45  * CHANGE HISTORY:
46  * Author      Date           Description of change
47  ************************************************************************/
48 #ifndef __LTKHIERARCHICALCLUSTERING_H
49 #define __LTKHIERARCHICALCLUSTERING_H
50 
51 
52 #ifndef _WIN32
53 //#include <values.h>
54 #endif
55 
56 #include "LTKInc.h"
57 #include "LTKTypes.h"
58 #include "LTKLoggerUtil.h"
59 #include "LTKException.h"
60 #include "LTKErrors.h"
61 
62 /*Enumerator for stopping criterion to be used*/
63 enum ELTKHCStoppingCriterion
64 {
65      LMETHOD,
66      AVG_SIL
67 };
68 
69 /*Enumerator for methods in hierarchical clustering*/
70  enum ELTKHCMethod
71  {
72      SINGLE_LINKAGE,
73      COMPLETE_LINKAGE,
74      AVERAGE_LINKAGE
75  };
76 
77 #define OUTPUT_HTML_FILE_NAME "output.html"
78 #define MIN_CUTOFF 20
79 
80 /**
81  * @class LTKHierarchicalClustering
82  * <p> This class does agglomerative hierarchical clustering. The data objects
83         which could be LTKTrace or LTKTraceGroup, are supplied as a vector.
84         Function that defines the distance between two data objects needs to be
85        supplied as a function pointer.One of the 3 methods (Single,Average or
86        Complete linkage) needs to be selected to define the way inter-cluster
87        has to be determined. In case number of clusters is not supplied,
88        it is determined using the L-method (default stopping criterion)<p> */
89 
90 template <class ClusterObjType,class DistanceClass>
91 class LTKHierarchicalClustering
92 {
93 
94      private:
95 
96           //reference to the vector containing the data objects to be clustered
97          const vector<ClusterObjType>& m_data;
98 
99           //triangular matrix containing the pairwise distances between data
100           //objects
101           float2DVector m_proximityMatrix;
102 
103           //data structure that stores current (intermediate) state of the
104           //clusters
105           int2DVector m_intermediateCG;
106 
107           //vector mapping the data object id and path to the data (unipen) file
108           stringVector m_hyperlinksVec;
109 
110           //contains the number of clusters required
111           int m_numOfClusters;
112 
113           //output file handle to write the cluster results as html
114           //with name of the file as OUTPUT_HTML_FILE_NAME
115           ofstream m_output;
116 
117           //flag to indicate whether the output is required as html
118           bool m_writeHTML;
119 
120           //flag to indicate whether to show all levels of the hierarchy in the
121           //html file
122           bool m_showAllLevels;
123 
124           //vector to hold merging distance for each number of clusters
125           floatVector m_mergingDist;
126 
127           //flag for determining number of clusters
128           bool m_determineClusters;
129 
130           //output result directory
131           string m_outputDir;
132 
133           //extension of the image corresponding to each data object in
134           //order to write in the html
135           string m_imageFileExtn;
136 
137           //Method for defining inter-cluster distance
138           ELTKHCMethod m_method;
139 
140            //number of clusters determined by Average Silhouette method
141           int m_numBySil;
142 
143           //cached clustering result corresponding minimum average silhouette
144           int2DVector m_cachedResult;
145 
146           //stopping criterion selected - LMethod or Average Silhouette
147           ELTKHCStoppingCriterion m_stoppingCriterion;
148 
149           //pointer to the class that has the definition of distance function
150           DistanceClass* m_distClassPtr;
151 
152           //function pointer type of the function that defines inter-object distance
153           typedef int (DistanceClass::*FN_PTR_DISTANCE)(const ClusterObjType&,
154                                                       const ClusterObjType&,
155                                                       float&) ;
156 
157           //distance function pointer
158           FN_PTR_DISTANCE m_distancePtr;
159 
160 
161      public:
162 
163 /**********************************************************************************
164 * AUTHOR       : Bharath A
165 * DATE              : 22-FEB-2005
166 * NAME              : LTKHierarchicalClustering
167 * DESCRIPTION  : Constructor to initialise the required parameters
168 * ARGUMENTS         : clusterObjects - vector of data objects which could be LKTrace or
169 *                                            LTKTraceGroup for HWR
170 *                     noOfClusters - Number of clusters required
171 *                     clusteringMethod - One of the 3 methods:
172 *                                             SINGLE_LINKAGE,AVERAGE_LINKAGE
173 *                                             or COMPLETE_LINKAGE
174 *
175 * RETURNS      :
176 * NOTES             :
177 * CHANGE HISTROY
178 * Author            Date                Description of change
179 *************************************************************************************/
180 
181 
182 LTKHierarchicalClustering(const vector<ClusterObjType>& clusterObjects,int noOfClusters,
183                                 ELTKHCMethod clusteringMethod=AVERAGE_LINKAGE) :
m_data(clusterObjects)184                                 m_data(clusterObjects),m_method(clusteringMethod),
185                                 m_numOfClusters(noOfClusters),m_writeHTML(false),
186                                 m_showAllLevels(false),
187                                 m_determineClusters(false)
188 {
189 
190      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
191           <<"LTKHierarchicalClustering::LTKHierarchicalClustering"
192           <<"(vector<ClusterObjType>,int,ELTKHCMethod)"<<endl;
193 
194      if(m_numOfClusters < 1 || m_numOfClusters>=clusterObjects.size())
195      {
196            LOG(LTKLogger::LTK_LOGLEVEL_ERR)
197                 <<"Number of clusters:"<<m_numOfClusters<<endl;
198 
199            LOG(LTKLogger::LTK_LOGLEVEL_ERR)
200                <<"Error : "<< EINVALID_NUM_CLUSTERS <<":"<< getErrorMessage(EINVALID_NUM_CLUSTERS)
201             <<" LTKHierarchicalClustering::"
202                <<"LTKHierarchicalClustering(vector<ClusterObjType>,int,ELTKHCMethod)"<<endl;
203 
204           throw LTKException(EINVALID_NUM_CLUSTERS);
205      }
206 
207      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
208           <<"LTKHierarchicalClustering::LTKHierarchicalClustering"
209           <<"(vector<ClusterObjType>,int,ELTKHCMethod)"<<endl;
210 
211 }
212 
213 
214 /**********************************************************************************
215 * AUTHOR       : Bharath A
216 * DATE              : 22-FEB-2005
217 * NAME              : LTKHierarchicalClustering
218 * DESCRIPTION  : Constructor to initialise the required parameters.
219 *                 Number of clusters is determined by L-method
220 * ARGUMENTS         : clusterObjects - vector of data objects which could be LKTrace or
221 *                                            LTKTraceGroup for HWR
222 *                     clusteringMethod - One of the 3 methods:
223 *                                             SINGLE_LINKAGE,AVERAGE_LINKAGE
224 *                                             or COMPLETE_LINKAGE
225 *                     stoppingCriterion - stopping criterion to determine the
226 *                                              right set of clusters: LMETHOD or AVG_SIL
227 *
228 * RETURNS      :
229 * NOTES             :
230 * CHANGE HISTROY
231 * Author            Date                Description of change
232 *************************************************************************************/
233 
234 
235 LTKHierarchicalClustering(const vector<ClusterObjType>& clusterObjects,
236                                 ELTKHCMethod clusteringMethod=AVERAGE_LINKAGE,
237                                 ELTKHCStoppingCriterion stoppingCriterion=LMETHOD) :
m_data(clusterObjects)238                                 m_data(clusterObjects),
239                                 m_method(clusteringMethod),
240                                 m_stoppingCriterion(stoppingCriterion),
241                                 m_writeHTML(false),
242                                 m_showAllLevels(false),
243                                 m_determineClusters(true)
244 {
245 
246      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
247           <<"LTKHierarchicalClustering::LTKHierarchicalClustering"
248           <<"(vector<ClusterObjType>,ELTKHCMethod,ELTKHCStoppingCriterion)"<<endl;
249 
250      if(clusterObjects.size()==0)
251      {
252           LOG(LTKLogger::LTK_LOGLEVEL_ERR)
253                <<"Number of elements in clusterObjects vector is zero"<<endl;
254 
255            LOG(LTKLogger::LTK_LOGLEVEL_ERR)
256                <<"Error : "<< ENO_DATA_TO_CLUSTER <<":"<< getErrorMessage(ENO_DATA_TO_CLUSTER)
257             <<" LTKHierarchicalClustering::LTKHierarchicalClustering"
258                <<"(vector<ClusterObjType>,ELTKHCMethod,ELTKHCStoppingCriterion)"<<endl;
259 
260 
261           throw LTKException(ENO_DATA_TO_CLUSTER);
262      }
263 
264      if(clusterObjects.size() < 6 && stoppingCriterion == LMETHOD)
265      {
266           LOG(LTKLogger::LTK_LOGLEVEL_ERR)
267                <<"Number of elements in clusterObjects vector is:"
268                <<clusterObjects.size()<<endl;
269 
270            LOG(LTKLogger::LTK_LOGLEVEL_ERR)
271                <<"Error : "<< EINSUFFICIENT_DATA_FOR_LMETHOD
272                <<":"<< getErrorMessage(EINSUFFICIENT_DATA_FOR_LMETHOD)
273             <<" LTKHierarchicalClustering::LTKHierarchicalClustering"
274                <<"(vector<ClusterObjType>,ELTKHCMethod,ELTKHCStoppingCriterion)"<<endl;
275 
276           throw LTKException(EINSUFFICIENT_DATA_FOR_LMETHOD);
277      }
278 
279      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
280           <<"LTKHierarchicalClustering::LTKHierarchicalClustering"
281           <<"(vector<ClusterObjType>,ELTKHCMethod,ELTKHCStoppingCriterion)"<<endl;
282 
283 }
284 
285 
286 
287 /**********************************************************************************
288 * AUTHOR       : Bharath A
289 * DATE              : 22-FEB-2005
290 * NAME              : cluster
291 * DESCRIPTION  : Clusters the input data objects. The number of clusters is determined
292 *                     based on the stopping criterion or as specified by the user.
293 * ARGUMENTS         : distanceClassPtr - pointer to the class that contains the distance
294 *                                             function defintion
295 *                     distFuncPtr - distance function pointer
296 * RETURNS      : error code
297 * NOTES             :
298 * CHANGE HISTROY
299 * Author            Date                Description of change
300 *************************************************************************************/
cluster(DistanceClass * distanceClassPtr,FN_PTR_DISTANCE distFuncPtr)301 int cluster(DistanceClass* distanceClassPtr,FN_PTR_DISTANCE distFuncPtr)
302 {
303 
304      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
305           <<"LTKHierarchicalClustering::cluster()"<<endl;
306 
307 
308      m_distancePtr=distFuncPtr;
309 
310      m_distClassPtr=distanceClassPtr;
311 
312 
313      //To compute inter-object distances
314      int errorCode = computeDistances();
315 
316      if (errorCode != SUCCESS )
317      {
318           LOG(LTKLogger::LTK_LOGLEVEL_ERR)
319         <<"Error: LTKHierarchicalClustering::cluster()"<<endl;
320 
321           LTKReturnError(errorCode)
322      }
323 
324      //if the user has specified the number of clusters
325      if(!m_determineClusters)
326      {
327            clusterToFindNumClusters();
328      }
329      else
330      {
331                m_numOfClusters=1;
332 
333                //clustering to determine the number of
334                //clusters vs merging distance curve
335                clusterToFindNumClusters();
336 
337                m_determineClusters=false;
338 
339                if(m_stoppingCriterion==LMETHOD)
340                {
341                     //Number of clusters determined by L-method
342                    m_numOfClusters=determineNumOfClusters();
343 
344                     LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)
345                          <<"Number of clusters determined using L-Method"
346                          <<m_numOfClusters<<endl;
347 
348                }
349                else if(m_stoppingCriterion==AVG_SIL)
350                {
351 
352                     //Number of clusters determined by silhouette method
353                    m_numOfClusters=m_numBySil;
354 
355                     LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)
356                          <<"Number of clusters determined using Average Silhouette method"
357                          <<m_numOfClusters<<endl;
358 
359                }
360 
361                //clearing intermediate clusters formed during evaluation
362                //of the stopping criterion
363                m_intermediateCG.clear();
364 
365                //clustering to the number of clusters determined
366                //by the stopping criterion
367                clusterToFindNumClusters();
368 
369        }
370 
371 
372      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
373           <<"LTKHierarchicalClustering::cluster()"<<endl;
374       return SUCCESS;
375 
376 }
377 
378 
379 
380 /**********************************************************************************
381 * AUTHOR       : Dinesh M
382 * DATE              : 23-Jan-2006
383 * NAME              : getProximityMatrix
384 * DESCRIPTION  : returns the distance matrix
385 * ARGUMENTS         :
386 * RETURNS      : proximity matrix (float2DVector)
387 * NOTES             :
388 * CHANGE HISTROY
389 * Author            Date                Description of change
390 *************************************************************************************/
391 
getProximityMatrix()392 const float2DVector& getProximityMatrix() const
393 {
394 
395      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
396           <<"LTKHierarchicalClustering::getProximityMatrix()"<<endl;
397 
398 
399      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
400           <<"LTKHierarchicalClustering::getProximityMatrix()"<<endl;
401 
402      return m_proximityMatrix;
403 }
404 
405 
406 /**********************************************************************************
407 * AUTHOR       : Bharath A
408 * DATE              : 22-FEB-2005
409 * NAME              : setOutputConfig
410 * DESCRIPTION  : This function sets the configuration for the output of clustering.
411 * ARGUMENTS         : outputDirectory - path to the directory where output html
412 *                                            is to be generated
413 *                     displayAllLevels - flag to indicate whether all levels in the
414 *                                             clustering need to written to the file
415 *                     imageFileExtension - extension of the image file (ex)"png".
416 *                                            If not specified <img> tag in the output html is not created.
417 * RETURNS      : error code
418 * NOTES             :
419 * CHANGE HISTROY
420 * Author            Date                Description of change
421 *************************************************************************************/
422 //int setOutputConfig(const string& outputDirectory,
423 //                              bool displayAllLevels=false,
424 //                              string imageFileExtension="")
425 int setOutputConfig(const string& outputDirectory,bool displayAllLevels=false,
426                          const string& imageFileExtension="")
427 {
428 
429      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
430           <<"LTKHierarchicalClustering::setOutputConfig()"<<endl;
431 
432      m_writeHTML=true;
433 
434      m_showAllLevels=displayAllLevels;
435 
436      string tempOutputFilePath=outputDirectory+"/"+OUTPUT_HTML_FILE_NAME;
437 
438      m_output.open(tempOutputFilePath.c_str());
439 
440      if(m_output.fail())
441      {
442 
443           LOG(LTKLogger::LTK_LOGLEVEL_ERR)
444                <<"Unable to create file:"
445                <<tempOutputFilePath<<endl;
446 
447           LOG(LTKLogger::LTK_LOGLEVEL_ERR)
448                <<"Error : "<< EFILE_CREATION_FAILED <<":"
449                <<getErrorMessage(EFILE_CREATION_FAILED)
450             <<" LTKHierarchicalClustering::setOutputConfig()" <<endl;
451 
452           LTKReturnError(EFILE_CREATION_FAILED)
453      }
454 
455      m_outputDir=outputDirectory;
456 
457      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)
458           <<"Clustering results output directory:"
459           <<m_outputDir<<endl;
460 
461      m_output.close();
462 
463      //If it takes the default value, <img> tag in the output is not generated
464      m_imageFileExtn=imageFileExtension;
465 
466      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)
467           <<"Image file extension:"<<m_imageFileExtn<<endl;
468 
469      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
470           <<"LTKHierarchicalClustering::setOutputConfig()"<<endl;
471 
472      return SUCCESS;
473 }
474 
475 
476 /**********************************************************************************
477 * AUTHOR       : Bharath A
478 * DATE              : 22-FEB-2005
479 * NAME              : setHyperlinkMap
480 * DESCRIPTION  : To set hyperlinks for each data object which refers to actual data file.
481 *                     Assumes one-to-one correspondence with the data vector
482 *                     passed in the constructor.
483 * ARGUMENTS         : hyperlinksVector - Vector containing paths to physical
484 *                                             files of each data object.
485 *
486 * RETURNS      : error code
487 * NOTES             :
488 * CHANGE HISTROY
489 * Author            Date                Description of change
490 *************************************************************************************/
491 //int setHyperlinkMap(const vector<string>& hyperlinksVector)
setHyperlinkMap(const vector<string> & hyperlinksVector)492 int setHyperlinkMap(const vector<string>& hyperlinksVector)
493 {
494 
495      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
496           <<"LTKHierarchicalClustering::setHyperlinkMap()"<<endl;
497 
498      if(m_data.size()!=hyperlinksVector.size())
499      {
500           LOG(LTKLogger::LTK_LOGLEVEL_ERR)
501                <<"cluster objects vector size:"<<m_data.size()
502                <<" and hyperlinks vector size:"<<hyperlinksVector.size()<<endl;
503 
504           LOG(LTKLogger::LTK_LOGLEVEL_ERR)
505                <<"Error : "<< EDATA_HYPERLINK_VEC_SIZE_MISMATCH
506                <<":"<< getErrorMessage(EDATA_HYPERLINK_VEC_SIZE_MISMATCH)
507             <<" LTKHierarchicalClustering::setHyperlinkMap()" <<endl;
508 
509           LTKReturnError(EDATA_HYPERLINK_VEC_SIZE_MISMATCH);
510      }
511 
512      m_hyperlinksVec=hyperlinksVector; //Vector for hyperlinks is set
513 
514 
515      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
516           <<"LTKHierarchicalClustering::setHyperlinkMap()"<<endl;
517 
518      return SUCCESS;
519 
520 }
521 
522 
523 /**********************************************************************************
524 * AUTHOR       : Bharath A
525 * DATE              : 22-FEB-2005
526 * NAME              : getClusterResult
527 * DESCRIPTION  : Populates the argument (vector of vectors) with data objects indices.
528 *                     Each row (inner vector) corresponds to a cluster.
529 * ARGUMENTS         : outClusterResult - reference to result vector of vectors.
530 *
531 * RETURNS      :
532 * NOTES             :
533 * CHANGE HISTROY
534 * Author            Date                Description of change
535 *************************************************************************************/
getClusterResult(vector<vector<int>> & outClusterResult)536 void getClusterResult(vector<vector<int> >& outClusterResult) const
537 {
538      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
539           <<"LTKHierarchicalClustering::getClusterResult()"<<endl;
540 
541      for(int v=0;v<m_intermediateCG.size();v++)
542      {
543 
544           outClusterResult.push_back(m_intermediateCG[v]);
545      }
546 
547      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
548           <<"LTKHierarchicalClustering::getClusterResult()"<<endl;
549 }
550 
551 /**********************************************************************************
552 * AUTHOR       : Bharath A
553 * DATE              : 22-FEB-2005
554 * NAME              : computeProximityMatrix
555 * DESCRIPTION  : Populates the argument (vector of vectors) with data objects indices.
556 *                     Each inner vector corresponds to a cluster.
557 * ARGUMENTS         : distanceClassPtr - pointer to the class that has the distance definition
558 *                     distFuncPtr - function pointer to the distance function
559 * RETURNS      : error code
560 * NOTES             :
561 * CHANGE HISTROY
562 * Author            Date                Description of change
563 *************************************************************************************/
564 
565 //void computeProximityMatrix(ComputeDistanceFunc computeDistFuncObj)
computeProximityMatrix(DistanceClass * distanceClassPtr,FN_PTR_DISTANCE distFuncPtr)566 int computeProximityMatrix(DistanceClass* distanceClassPtr,
567                                  FN_PTR_DISTANCE distFuncPtr)
568 {
569      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
570           <<"LTKHierarchicalClustering::computeProximityMatrix()"<<endl;
571 
572      m_distancePtr=distFuncPtr;
573      m_distClassPtr=distanceClassPtr;
574 
575      int errorCode;
576 
577      if((errorCode=computeDistances())!=SUCCESS)
578      {
579            LOG(LTKLogger::LTK_LOGLEVEL_ERR)
580                <<"Error: LTKHierarchicalClustering::computeProximityMatrix()"<<endl;
581 
582           LTKReturnError(errorCode);
583      }
584 
585      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
586           <<"LTKHierarchicalClustering::computeProximityMatrix()"<<endl;
587 
588      return SUCCESS;
589 }
590 
591 
592 
593 private:
594 
595 /**********************************************************************************
596 * AUTHOR       : Bharath A
597 * DATE              : 22-FEB-2005
598 * NAME              : computeDistances
599 * DESCRIPTION  : Computes inter-object distances and puts them in the distance matrix
600 * ARGUMENTS         :
601 * RETURNS      : error code
602 * NOTES             :
603 * CHANGE HISTROY
604 * Author            Date                Description of change
605 *************************************************************************************/
606 
computeDistances()607 int computeDistances()
608 {
609 
610      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
611           <<"LTKHierarchicalClustering::computeDistances()"<<endl;
612 
613      for(int i=0;i<(m_data.size()-1);++i)
614      {
615           vector<float> eachRow((m_data.size()-i)-1);//added -1 at the end
616 
617           int c=0;
618 
619           for(int j=i+1;j<m_data.size();++j)
620           {
621                //external distance function called
622                int errorCode = (m_distClassPtr->*m_distancePtr)(m_data[i],m_data[j], eachRow[c]);
623 
624                if (errorCode != SUCCESS )
625                {
626                     LOG(LTKLogger::LTK_LOGLEVEL_ERR)
627                          <<"Error while calling distance function"<<endl;
628 
629                     LOG(LTKLogger::LTK_LOGLEVEL_ERR)
630                          <<"Error: LTKHierarchicalClustering::computeDistances()"<<endl;
631 
632                     LTKReturnError(errorCode);
633                }
634 
635                ++c;
636 
637           }
638 
639           m_proximityMatrix.push_back(eachRow);
640 
641      }
642 
643      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
644           <<"LTKHierarchicalClustering::computeDistances()"<<endl;
645 
646     return SUCCESS;
647 
648 }
649 
650 
651 
652 /**********************************************************************************
653 * AUTHOR       : Bharath A
654 * DATE              : 22-FEB-2005
655 * NAME              : clusterToFindNumClusters
656 * DESCRIPTION  : Clusters the data objects hierarchically (agglomerative)
657 *                     till the desired number of
658 *                     clusters and also evaluates the stopping criterion selected.
659 * ARGUMENTS         :
660 * RETURNS      : error code
661 * NOTES             :
662 * CHANGE HISTROY
663 * Author            Date                Description of change
664 *************************************************************************************/
665 
clusterToFindNumClusters()666 int clusterToFindNumClusters()
667 {
668 
669      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
670           <<"LTKHierarchicalClustering::clusterToFindNumClusters()"<<endl;
671 
672    if(m_stoppingCriterion==LMETHOD)
673    {
674           //Number of clusters needs to be determined
675           if(m_determineClusters)
676           {
677                //map for number of clusters vs. merging distance
678                m_mergingDist.reserve(m_data.size());
679           }
680    }
681    else if(m_stoppingCriterion==AVG_SIL)
682    {
683           if(m_writeHTML==false && m_cachedResult.size()>0)
684           {
685                m_intermediateCG=m_cachedResult;
686 
687                return SUCCESS;
688           }
689    }
690 
691      for(int i=0;i<m_data.size();i++)
692      {
693           vector<int> v;
694           v.push_back(i);
695 
696           //To begin with, each data object is cluster by itself
697           m_intermediateCG.push_back(v);
698      }
699 
700      if(m_writeHTML)  //If output is needed as html
701      {
702           string outputFilePath=m_outputDir+"/"+OUTPUT_HTML_FILE_NAME;
703 
704           m_output.open(outputFilePath.c_str()); //Cluster output file is created
705 
706           if(m_output.fail())
707           {
708                LOG(LTKLogger::LTK_LOGLEVEL_ERR)
709                     <<"Unable to create file:"
710                     <<outputFilePath<<endl;
711 
712                LOG(LTKLogger::LTK_LOGLEVEL_ERR)
713                     <<"Error : "<< EFILE_CREATION_FAILED
714                     <<":"<< getErrorMessage(EFILE_CREATION_FAILED)
715                 <<" LTKHierarchicalClustering::clusterToFindNumClusters()" <<endl;
716 
717                LTKReturnError(EFILE_CREATION_FAILED);
718           }
719 
720           /*Html tags are written*/
721           m_output<<"<html>\n";
722           m_output<<"<body>\n";
723           m_output<<"<table border='1' bordercolor='black'>\n";
724 
725 
726           m_output<<"<tr>\n";
727 
728 
729           for(int v=0;v<m_intermediateCG.size();v++)
730           {
731 
732                int clusterSize=m_intermediateCG[v].size();
733 
734                m_output<<"<td colspan=\""<<clusterSize<<"\">";
735 
736                for(int w=0;w<clusterSize;w++)
737                {
738                     if(m_hyperlinksVec.size()>0)
739                     {
740                          m_output<<"<a href='"
741                                    <<m_hyperlinksVec[m_intermediateCG[v][w]]
742                                    <<"'>"<<m_intermediateCG[v][w]<<"</a>&nbsp;";
743                     }
744                     else
745                     {
746                          m_output<<m_intermediateCG[v][w]<<"&nbsp;";
747                     }
748 
749                     //if there is an image file corresponding to each data object
750                     if(!m_imageFileExtn.empty())
751                     {
752                          m_output<<"<img src=\""
753                                    <<m_intermediateCG[v][w]<<"."<<m_imageFileExtn
754                                    <<"\" border=\"0\"/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;";
755                     }
756 
757 
758                }
759           }
760 
761           m_output<<"<td><b>";
762           m_output<<"Inter-cluster Dist";
763           m_output<<"</b></td>";
764 
765           m_output<<"</tr>\n";
766 
767      }
768 
769      if(m_numOfClusters<m_data.size() || m_determineClusters==true)
770      {
771 
772           int currNumOfClusters = m_data.size();
773 
774           //this local variable is used only for Average Silhouette method
775           float minSil=FLT_MAX;
776 
777 
778           for(int it=0;it<(m_data.size()-m_numOfClusters);++it)
779           {
780                vector<int> toCluster;
781 
782                //to find the clusters that need to be merged
783                float interClusterDistance=findGroup(toCluster);
784 
785                currNumOfClusters=m_data.size()-it-1;
786 
787                if(m_stoppingCriterion==AVG_SIL)
788                {
789                     float silDiff=computeAvgSil(toCluster[0],toCluster[1]);
790 
791                     if(silDiff<minSil)
792                     {
793                          minSil=silDiff;
794                          if(currNumOfClusters > 2)
795                          {
796                               m_numBySil=currNumOfClusters+1;
797                               m_cachedResult=m_intermediateCG;
798                          }
799 
800                     }
801                }
802                else if(m_stoppingCriterion==LMETHOD && m_determineClusters==true)
803                {
804                          m_mergingDist[currNumOfClusters]=interClusterDistance;
805                }
806 
807 
808                //clusters are merged
809                m_intermediateCG[toCluster[0]].insert(m_intermediateCG[toCluster[0]].end(),
810                                                               m_intermediateCG[toCluster[1]].begin(),
811                                                               m_intermediateCG[toCluster[1]].end());
812 
813                //old cluster deleted
814                m_intermediateCG.erase(m_intermediateCG.begin()+ toCluster[1]);
815 
816 
817 
818                     if(m_writeHTML)
819                     {
820                          if(!m_showAllLevels)
821                          {
822                               if(currNumOfClusters==m_numOfClusters)
823                               {
824                                    writeClustersAsHTML(interClusterDistance);
825                               }
826                          }
827                          else
828                          {
829                               writeClustersAsHTML(interClusterDistance);
830                          }
831 
832                     }
833 
834 
835           }
836 
837 
838      }
839 
840 
841      //closing the html tags
842      if(m_writeHTML)
843      {
844           m_output<<"</table>\n";
845           m_output<<"</body>\n";
846           m_output<<"</html>";
847 
848           m_output.close();
849      }
850 
851      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
852           <<"LTKHierarchicalClustering::clusterToFindNumClusters()"<<endl;
853 
854      return SUCCESS;
855 
856 }
857 
858 
859 
860 
861 
862 
863 
864 /**********************************************************************************
865 * AUTHOR       : Bharath A
866 * DATE              : 22-FEB-2005
867 * NAME              : getInterObjectDistance
868 * DESCRIPTION  : Returns the distance between two data objects from the distance matrix
869 * ARGUMENTS         : firstObjIndex - index of the first data object
870 *                     secondObjIndex -  index of the second data object
871 * RETURNS      : distance (float)
872 * NOTES             :
873 * CHANGE HISTROY
874 * Author            Date                Description of change
875 *************************************************************************************/
getInterObjectDistance(int firstObjIndex,int secondObjIndex)876 float getInterObjectDistance(int firstObjIndex,int secondObjIndex) const
877 {
878      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
879           <<"LTKHierarchicalClustering::getInterObjectDistance()"<<endl;
880 
881      int row = 0;
882      int col = 0;
883 
884      if(firstObjIndex < secondObjIndex)
885      {
886           row=firstObjIndex;
887           col=secondObjIndex;
888      }
889      else
890      {
891           row=secondObjIndex;
892           col=firstObjIndex;
893      } //lesser index is made as row and the other as column
894 
895      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
896           <<"LTKHierarchicalClustering::getInterObjectDistance()"<<endl;
897 
898      return m_proximityMatrix[row][col-row-1];
899 }
900 
901 /**********************************************************************************
902 * AUTHOR       : Bharath A
903 * DATE              : 22-FEB-2005
904 * NAME              : findGroup
905 * DESCRIPTION  : Finds the indices in the m_intermediateCG (clusters) that
906 *                     need to be merged
907 * ARGUMENTS         : pairToCombine - vector for storing the cluster indices
908 * RETURNS      : inter cluster distance (float)
909 * NOTES             :
910 * CHANGE HISTROY
911 * Author            Date                Description of change
912 *************************************************************************************/
913 //double findGroup(vector<int> pairToCombine)
findGroup(vector<int> & pairToCombine)914 float findGroup(vector<int>& pairToCombine) const
915 {
916      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
917           <<"LTKHierarchicalClustering::findGroup()"<<endl;
918 
919      float minDistance=FLT_MAX;
920 
921      pairToCombine.clear();
922 
923      pairToCombine.resize(2);
924 
925      for(int i=0;i<m_intermediateCG.size();i++)
926      {
927           for(int j=i+1;j<m_intermediateCG.size();j++)
928           {
929                float tempDist=findInterClusterDistance(m_intermediateCG[i],
930                                                                   m_intermediateCG[j]);
931 
932                if(tempDist<minDistance)
933                {
934 
935                     minDistance=tempDist;
936                     pairToCombine[0]=i;
937                     pairToCombine[1]=j;
938                }
939           }
940 
941      }
942 
943      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
944           <<"LTKHierarchicalClustering::findGroup()"<<endl;
945 
946 
947      return minDistance;
948 
949 }
950 
951 
952 /**********************************************************************************
953 * AUTHOR       : Bharath A
954 * DATE              : 22-FEB-2005
955 * NAME              : findInterClusterDistance
956 * DESCRIPTION  : Finds the inter-cluster distance.
957 *                     The contents of each cluster are in the vectors.
958 * ARGUMENTS         : v1 cluster one
959 *                     v2 cluster two
960 * RETURNS      : inter cluster distance
961 * NOTES             :
962 * CHANGE HISTROY
963 * Author            Date                Description of change
964 *************************************************************************************/
965 
966 //double findInterClusterDistance(const vector<int>& v1,const vector<int>& v2)
findInterClusterDistance(const vector<int> & cluster1,const vector<int> & cluster2)967 float findInterClusterDistance(const vector<int>& cluster1, const vector<int>& cluster2) const
968 {
969      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
970           <<"LTKHierarchicalClustering::findInterClusterDistance()"<<endl;
971 
972      float groupDistance=0.0;
973 
974      /*For single-linkage algorithm*/
975      if(m_method==SINGLE_LINKAGE)
976      {
977           groupDistance= FLT_MAX;
978 
979           for(int i=0;i<cluster1.size();i++)
980           {
981                for(int j=0;j<cluster2.size();j++)
982                {
983                     float temp=getInterObjectDistance(cluster1[i],cluster2[j]);
984 
985                     if(temp < groupDistance)
986                     {
987                          groupDistance=temp;
988                     }
989                }
990           }
991      }
992 
993 
994      /*For average-linkage algorithm*/
995      if(m_method==AVERAGE_LINKAGE)
996      {
997           groupDistance=0.0;
998 
999           for(int i=0;i<cluster1.size();i++)
1000           {
1001                for(int j=0;j<cluster2.size();j++)
1002                {
1003 
1004                     groupDistance+=getInterObjectDistance(cluster1[i],cluster2[j]);
1005                }
1006           }
1007 
1008           groupDistance/=((float)(cluster1.size()*cluster2.size()));
1009      }
1010 
1011 
1012      /*For complete-linkage algorithm*/
1013      if(m_method==COMPLETE_LINKAGE)
1014      {
1015           groupDistance=0.0;
1016 
1017           for(int i=0;i<cluster1.size();i++)
1018           {
1019                for(int j=0;j<cluster2.size();j++)
1020                {
1021                     float temp=getInterObjectDistance(cluster1[i],cluster2[j]);
1022 
1023                     if(temp > groupDistance)
1024                     {
1025                          groupDistance=temp;
1026                     }
1027                }
1028           }
1029      }
1030 
1031      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
1032           <<"LTKHierarchicalClustering::findInterClusterDistance()"<<endl;
1033 
1034 
1035      return groupDistance;
1036 
1037 
1038 }
1039 
1040 /**********************************************************************************
1041 * AUTHOR       : Bharath A
1042 * DATE              : 22-FEB-2005
1043 * NAME              : writeClustersAsHTML
1044 * DESCRIPTION  : Writes the cluster results as html with data objects' and clusters' ids.
1045 *                     If hyperlinks vector is set, provides links to actual files.
1046 * ARGUMENTS         : interClustDist - merging distance of the new cluster formed
1047 * RETURNS      :
1048 * NOTES             :
1049 * CHANGE HISTROY
1050 * Author            Date                Description of change
1051 *************************************************************************************/
1052 
writeClustersAsHTML(float interClustDist)1053 void writeClustersAsHTML(float interClustDist)
1054 {
1055 
1056      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
1057           <<"LTKHierarchicalClustering::writeClustersAsHTML()"<<endl;
1058 
1059      m_output<<"<tr>\n";
1060 
1061      for(int v=0;v<m_intermediateCG.size();v++)
1062      {
1063           int clusterSize=m_intermediateCG[v].size();
1064 
1065           m_output<<"<td colspan=\""<<clusterSize<<"\">";
1066 
1067           m_output<<"("<<v<<")<br>";
1068 
1069           for(int w=0;w<clusterSize;w++)
1070           {
1071                if(m_hyperlinksVec.size()>0)
1072                {
1073                     m_output<<"<a href='"<<m_hyperlinksVec[m_intermediateCG[v][w]]
1074                             <<"'>"<<m_intermediateCG[v][w]<<"</a>&nbsp;";
1075                }
1076                else
1077                {
1078                     m_output<<m_intermediateCG[v][w]<<"&nbsp;";
1079                }
1080                if(!m_imageFileExtn.empty())
1081                {
1082                     m_output<<"<img src=\""<<m_intermediateCG[v][w]<<"."
1083                             <<m_imageFileExtn
1084                             <<"\" border=\"0\"/>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;";
1085                }
1086 
1087           }
1088      }
1089 
1090      m_output<<"<td>";
1091 
1092      m_output<<"("<<m_intermediateCG.size()
1093                <<")&nbsp;&nbsp;&nbsp;<b>"<<interClustDist<<"</b>";
1094 
1095      m_output<<"</td>";
1096 
1097      m_output<<"</tr>\n";
1098 
1099      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
1100           <<"LTKHierarchicalClustering::writeClustersAsHTML()"<<endl;
1101 
1102 }
1103 
1104 /**********************************************************************************
1105 * AUTHOR       : Bharath A
1106 * DATE              : 22-FEB-2005
1107 * NAME              : determineNumOfClusters
1108 * DESCRIPTION  : Determines the number of clusters to be formed using
1109 *                     iterative refinement of L-method.
1110 *                     REFERENCE:S. Salvador and P. Chan. Determining the number of clusters/
1111 *                     segments in hierarchical clustering/segmentation algorithms.
1112 *                     Proceedings of 16th IEEE International Conference
1113 *                     on Tools with Artificial Intelligence, 3:1852-1857, 2004.
1114 * ARGUMENTS         :
1115 * RETURNS      : number of clusters
1116 * NOTES             :
1117 * CHANGE HISTROY
1118 * Author            Date                Description of change
1119 *************************************************************************************/
1120 
determineNumOfClusters()1121 int determineNumOfClusters() const
1122 {
1123      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
1124           <<"LTKHierarchicalClustering::determineNumOfClusters()"<<endl;
1125 
1126      int cutOff=(int)m_mergingDist.capacity()-1;
1127 
1128      int lastKnee=0;
1129      int currentKnee=0;
1130 
1131      lastKnee=cutOff;
1132 
1133      currentKnee=lastKnee;
1134 
1135      bool trueCutOff=false;
1136 
1137      /*Iterative refinement of the L-method result*/
1138 
1139      while(true)
1140      {
1141 
1142           lastKnee=currentKnee;
1143 
1144           currentKnee=determineKnee(cutOff);
1145 
1146           if(trueCutOff)
1147           {
1148 
1149                if(currentKnee >= lastKnee)
1150                {
1151 
1152                     break;
1153                }
1154           }
1155 
1156           int temp=currentKnee*2;
1157 
1158           if(temp>cutOff)
1159           {
1160                --cutOff;
1161 
1162                trueCutOff=false;
1163           }
1164           else
1165           {
1166                cutOff=temp;
1167 
1168                trueCutOff=true;
1169           }
1170 
1171 
1172           if(cutOff < MIN_CUTOFF)
1173           {
1174 
1175                break;
1176           }
1177 
1178 
1179      }
1180 
1181      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)
1182           <<"Number of clusters determined by iterative refinement:"
1183           <<currentKnee<<endl;
1184 
1185      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
1186           <<"LTKHierarchicalClustering::determineNumOfClusters()"<<endl;
1187 
1188      return currentKnee;
1189 }
1190 
1191 
1192 
1193 
1194 /**********************************************************************************
1195 * AUTHOR       : Bharath A
1196 * DATE              : 22-FEB-2005
1197 * NAME              : determineKnee
1198 * DESCRIPTION  : Determines the knee of the given number of clusters vs. merging distance curve
1199 * ARGUMENTS         : maxNumClusters - maximum number of clusters
1200 * RETURNS      : knee point of the curve
1201 * NOTES             :
1202 * CHANGE HISTROY
1203 * Author            Date                Description of change
1204 *************************************************************************************/
determineKnee(int maxNumClusters)1205 int determineKnee(int maxNumClusters) const
1206 {
1207      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
1208           <<"LTKHierarchicalClustering::determineKnee()"<<endl;
1209 
1210      int result=0;
1211 
1212      float minRMSE = FLT_MAX;
1213 
1214      //atleast two points are required to fit lines, hence c ranges
1215      //from 3 to maxNumClusters-2
1216      for(int c=3;c<maxNumClusters-2;c++)
1217      {
1218 
1219           float lRMSE=0,rRMSE=0;
1220 
1221           findRMSE(c,maxNumClusters,lRMSE,rRMSE);
1222 
1223           float     cRMSE=(((float)(c-1)/(float)(maxNumClusters-1))*lRMSE)
1224                            +(((float)(maxNumClusters-c)/(float)(maxNumClusters-1))*rRMSE);
1225 
1226           if(cRMSE<minRMSE)
1227           {
1228 
1229                minRMSE=cRMSE;
1230 
1231                result=c;
1232           }
1233      }
1234 
1235      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
1236           <<"LTKHierarchicalClustering::determineKnee()"<<endl;
1237 
1238      return result+1;
1239 }
1240 
1241 
1242 
1243 
1244 /**********************************************************************************
1245 * AUTHOR       : Bharath A
1246 * DATE              : 22-FEB-2005
1247 * NAME              : determineKnee
1248 * DESCRIPTION  : Determines left and right RMSE values of the given point
1249 *                     on the no. of clusters vs. merging distance curve.
1250 *                     It fits two regression lines on either side of the point to find RMSEs
1251 * ARGUMENTS         : candidateKnee - candidata knee point
1252 *                     maxNumClusters - upper bound on number of clusters
1253 *                     lRMSE - output RMSE on the left of c
1254 *                     rRMSE - output RMSE on the right of c
1255 * RETURNS      :
1256 * NOTES             :
1257 * CHANGE HISTROY
1258 * Author            Date                Description of change
1259 *************************************************************************************/
findRMSE(int candidateKnee,int maxNumClusters,float & outLRMSE,float & outRRMSE)1260 void findRMSE(int candidateKnee,int maxNumClusters,float& outLRMSE,float& outRRMSE) const
1261 {
1262      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
1263           <<"LTKHierarchicalClustering::findRMSE()"<<endl;
1264 
1265      float avgXLeft = 0;
1266      float avgYLeft = 0;
1267      float avgXRight = 0;
1268      float avgYRight = 0;
1269 
1270      /*Regression line coefficients*/
1271      float beta0Left = 0;
1272      float beta1Left = 0;
1273      float beta0Right = 0;
1274      float beta1Right = 0;
1275 
1276      int i = 0;
1277      int j = 0;
1278 
1279      for(i=2;i<=candidateKnee;i++)
1280      {
1281           avgYLeft+=m_mergingDist[i];
1282 
1283           avgXLeft+=i;
1284      }
1285 
1286 
1287      avgYLeft /= (candidateKnee-1);
1288      avgXLeft /= (candidateKnee-1);
1289 
1290 
1291      for(j=candidateKnee+1;j<=maxNumClusters;j++)
1292      {
1293 
1294           avgYRight += m_mergingDist[j];
1295           avgXRight += j;
1296      }
1297 
1298      avgYRight /= (maxNumClusters-candidateKnee);
1299      avgXRight /= (maxNumClusters-candidateKnee);
1300 
1301      float numer=0;
1302      float denom=0;
1303 
1304      for(i=2;i<=candidateKnee;i++)
1305      {
1306 
1307           numer += ((i-avgXLeft)*(m_mergingDist[i]-avgYLeft));
1308           denom += ((i-avgXLeft)*(i-avgXLeft));
1309      }
1310 
1311      beta1Left = numer/denom;
1312      beta0Left = avgYLeft-(beta1Left*avgXLeft);
1313 
1314      numer=0;denom=0;
1315 
1316      for(j=candidateKnee+1;j<=maxNumClusters;j++)
1317      {
1318           numer += ((j-avgXRight)*(m_mergingDist[j]-avgYRight));
1319           denom += ((j-avgXRight)*(j-avgXRight));
1320      }
1321 
1322     if(denom > EPS)
1323     {
1324           beta1Right = numer / denom;
1325      }
1326      else
1327      {
1328           beta1Right = 0;
1329      }
1330 
1331      beta0Right = avgYRight-(beta1Right*avgXRight);
1332 
1333      float errorSOS=0;
1334 
1335      for(i=2;i<=candidateKnee;i++)
1336      {
1337 
1338           float yCap = (beta0Left+(beta1Left*i));
1339 
1340           errorSOS += ((m_mergingDist[i]-yCap)*(m_mergingDist[i]-yCap));
1341 
1342      }
1343 
1344      outLRMSE=sqrt(errorSOS/(candidateKnee-2));
1345 
1346      errorSOS=0;
1347 
1348      for(j=candidateKnee+1;j<=maxNumClusters;j++)
1349      {
1350 
1351           float yCap = beta0Right + (beta1Right*j);
1352 
1353           errorSOS += (m_mergingDist[j]-yCap)*(m_mergingDist[j]-yCap);
1354 
1355      }
1356 
1357      outRRMSE=sqrt(errorSOS/(maxNumClusters-candidateKnee-1));
1358 
1359      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
1360           <<"LTKHierarchicalClustering::findRMSE()"<<endl;
1361 
1362 }
1363 
1364 
1365 
1366 
1367 /**********************************************************************************
1368 * AUTHOR       : Bharath A
1369 * DATE              : 22-FEB-2005
1370 * NAME              : computeAvgSil
1371 * DESCRIPTION  : Funtion that determines the change in the ratio of
1372 *                     intra and inter cluster similarity before and after merging
1373 * ARGUMENTS         : clust1Index - index of cluster one
1374 *                     clust2Index - index of cluster two
1375 * RETURNS      : average silhouette computed
1376 * NOTES             :
1377 * CHANGE HISTROY
1378 * Author            Date                Description of change
1379 *************************************************************************************/
computeAvgSil(int clust1Index,int clust2Index)1380 float computeAvgSil(int clust1Index,int clust2Index) const
1381 {
1382      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: "
1383           <<"LTKHierarchicalClustering::computeAvgSil()"<<endl;
1384 
1385      const vector<int>& clust1=m_intermediateCG[clust1Index];
1386 
1387      const vector<int>& clust2=m_intermediateCG[clust2Index];
1388 
1389      vector<int> combinedClust;
1390 
1391      combinedClust.insert(combinedClust.end(),clust1.begin(),clust1.end());
1392 
1393      combinedClust.insert(combinedClust.end(),clust2.begin(),clust2.end());
1394 
1395      float clust1TotalSils=0.0f,clust2TotalSils=0.0f,combinedClustTotalSils=0.0f;
1396 
1397      for(int i=0;i<clust1.size();i++)
1398      {
1399           int dataObj=clust1[i];
1400 
1401           float avgIntraDist=0.0;
1402 
1403           if(clust1.size()>1)
1404           {
1405                for(int j=0;j<clust1.size();j++)
1406                {
1407                     if(clust1[j]!=dataObj)
1408                     {
1409                          avgIntraDist+=getInterObjectDistance(dataObj,clust1[j]);
1410 
1411                     }
1412                }
1413 
1414                avgIntraDist/=((float)(clust1.size()-1));
1415           }
1416 
1417           float minInterDist= FLT_MAX;
1418 
1419           for(int r=0;r<m_intermediateCG.size();r++)
1420           {
1421                float avgInterDist=0.0;
1422 
1423                if(r!=clust1Index)
1424                {
1425                     for(int c=0;c<m_intermediateCG[r].size();c++)
1426                     {
1427                          avgInterDist+=getInterObjectDistance(dataObj,m_intermediateCG[r][c]);
1428                     }
1429 
1430                     avgInterDist/=((float)m_intermediateCG[r].size());
1431 
1432                     if(avgInterDist<minInterDist)
1433                     {
1434                          minInterDist=avgInterDist;
1435                     }
1436                }
1437 
1438           }
1439 
1440           float dataObjSil=0.0;
1441 
1442           if(minInterDist > avgIntraDist && minInterDist > EPS)
1443           {
1444 
1445                dataObjSil=(minInterDist-avgIntraDist)/minInterDist;
1446 
1447           }
1448           else if(avgIntraDist > EPS)
1449           {
1450 
1451                dataObjSil=(minInterDist-avgIntraDist)/avgIntraDist;
1452 
1453           }
1454 
1455           clust1TotalSils+=dataObjSil;
1456      }
1457 
1458      for(int ii=0;ii<clust2.size();ii++)
1459      {
1460           int dataObj=clust2[ii];
1461 
1462           float avgIntraDist=0.0;
1463 
1464           if(clust2.size()>1)
1465           {
1466                for(int j=0;j<clust2.size();j++)
1467                {
1468                     if(clust2[j]!=dataObj)
1469                     {
1470 
1471                          avgIntraDist+=getInterObjectDistance(dataObj,clust2[j]);
1472 
1473                     }
1474                }
1475 
1476                avgIntraDist/=((float)(clust2.size()-1));
1477           }
1478 
1479           float minInterDist= FLT_MAX;
1480 
1481           for(int r=0;r<m_intermediateCG.size();r++)
1482           {
1483                float avgInterDist=0.0;
1484 
1485                if(r!=clust2Index)
1486                {
1487                     for(int c=0;c<m_intermediateCG[r].size();c++)
1488                     {
1489                          avgInterDist+=getInterObjectDistance(dataObj,
1490                                                                        m_intermediateCG[r][c]);
1491                     }
1492 
1493                     avgInterDist/=((float)m_intermediateCG[r].size());
1494 
1495                     if(avgInterDist < minInterDist)
1496                     {
1497                          minInterDist=avgInterDist;
1498                     }
1499                }
1500 
1501           }
1502           float dataObjSil=0.0;
1503 
1504           if(minInterDist > avgIntraDist && minInterDist > EPS)
1505           {
1506                dataObjSil=(minInterDist-avgIntraDist)/minInterDist;
1507           }
1508           else if(avgIntraDist > EPS)
1509           {
1510                dataObjSil=(minInterDist-avgIntraDist)/avgIntraDist;
1511           }
1512 
1513           clust2TotalSils+=dataObjSil;
1514      }
1515 
1516 
1517      for(int iii=0;iii<combinedClust.size();iii++)
1518      {
1519           int dataObj=combinedClust[iii];
1520 
1521           float avgIntraDist=0.0;
1522 
1523           if(combinedClust.size()>1)
1524           {
1525                for(int j=0;j<combinedClust.size();j++)
1526                {
1527                     if(combinedClust[j]!=dataObj)
1528                     {
1529                          avgIntraDist+=getInterObjectDistance(dataObj,combinedClust[j]);
1530 
1531                     }
1532                }
1533                avgIntraDist/=((float)(combinedClust.size()-1));
1534           }
1535 
1536           float minInterDist=FLT_MAX;
1537 
1538           for(int r=0;r<m_intermediateCG.size();r++)
1539           {
1540 
1541                if(r!=clust1Index && r!=clust2Index)
1542                {
1543                     float avgInterDist=0.0f;
1544 
1545                     for(int c=0;c<m_intermediateCG[r].size();c++)
1546                     {
1547                          avgInterDist+=getInterObjectDistance(dataObj,m_intermediateCG[r][c]);
1548                     }
1549 
1550                     avgInterDist = (float)avgInterDist/ ((float)m_intermediateCG[r].size());
1551 
1552                     if(avgInterDist<minInterDist)
1553                     {
1554                          minInterDist=avgInterDist;
1555 
1556                     }
1557                }
1558 
1559           }
1560 
1561           float dataObjSil=0.0;
1562 
1563           if(minInterDist>avgIntraDist && minInterDist > EPS)
1564           {
1565 
1566                dataObjSil=(minInterDist-avgIntraDist)/minInterDist;
1567 
1568           }
1569           else if(avgIntraDist > EPS)
1570           {
1571 
1572                dataObjSil=(minInterDist-avgIntraDist)/avgIntraDist;
1573 
1574           }
1575 
1576           combinedClustTotalSils+=dataObjSil;
1577      }
1578 
1579 
1580      LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: "
1581           <<"LTKHierarchicalClustering::computeAvgSil()"<<endl;
1582 
1583 
1584      return (combinedClustTotalSils-clust1TotalSils-clust2TotalSils);
1585 
1586 }
1587 
1588 
1589 
1590 };
1591 #endif
1592 
1593