1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 package org.apache.commons.math3.stat.clustering; 18 19 import java.util.ArrayList; 20 import java.util.Collection; 21 import java.util.HashMap; 22 import java.util.HashSet; 23 import java.util.List; 24 import java.util.Map; 25 import java.util.Set; 26 27 import org.apache.commons.math3.exception.NotPositiveException; 28 import org.apache.commons.math3.exception.NullArgumentException; 29 import org.apache.commons.math3.util.MathUtils; 30 31 /** 32 * DBSCAN (density-based spatial clustering of applications with noise) algorithm. 33 * <p> 34 * The DBSCAN algorithm forms clusters based on the idea of density connectivity, i.e. 35 * a point p is density connected to another point q, if there exists a chain of 36 * points p<sub>i</sub>, with i = 1 .. n and p<sub>1</sub> = p and p<sub>n</sub> = q, 37 * such that each pair <p<sub>i</sub>, p<sub>i+1</sub>> is directly density-reachable. 38 * A point q is directly density-reachable from point p if it is in the ε-neighborhood 39 * of this point. 40 * <p> 41 * Any point that is not density-reachable from a formed cluster is treated as noise, and 42 * will thus not be present in the result. 43 * <p> 44 * The algorithm requires two parameters: 45 * <ul> 46 * <li>eps: the distance that defines the ε-neighborhood of a point 47 * <li>minPoints: the minimum number of density-connected points required to form a cluster 48 * </ul> 49 * <p> 50 * <b>Note:</b> as DBSCAN is not a centroid-based clustering algorithm, the resulting 51 * {@link Cluster} objects will have no defined center, i.e. {@link Cluster#getCenter()} will 52 * return {@code null}. 53 * 54 * @param <T> type of the points to cluster 55 * @see <a href="http://en.wikipedia.org/wiki/DBSCAN">DBSCAN (wikipedia)</a> 56 * @see <a href="http://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf"> 57 * A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise</a> 58 * @since 3.1 59 * @deprecated As of 3.2 (to be removed in 4.0), 60 * use {@link org.apache.commons.math3.ml.clustering.DBSCANClusterer} instead 61 */ 62 @Deprecated 63 public class DBSCANClusterer<T extends Clusterable<T>> { 64 65 /** Maximum radius of the neighborhood to be considered. */ 66 private final double eps; 67 68 /** Minimum number of points needed for a cluster. */ 69 private final int minPts; 70 71 /** Status of a point during the clustering process. */ 72 private enum PointStatus { 73 /** The point has is considered to be noise. */ 74 NOISE, 75 /** The point is already part of a cluster. */ 76 PART_OF_CLUSTER 77 } 78 79 /** 80 * Creates a new instance of a DBSCANClusterer. 81 * 82 * @param eps maximum radius of the neighborhood to be considered 83 * @param minPts minimum number of points needed for a cluster 84 * @throws NotPositiveException if {@code eps < 0.0} or {@code minPts < 0} 85 */ DBSCANClusterer(final double eps, final int minPts)86 public DBSCANClusterer(final double eps, final int minPts) 87 throws NotPositiveException { 88 if (eps < 0.0d) { 89 throw new NotPositiveException(eps); 90 } 91 if (minPts < 0) { 92 throw new NotPositiveException(minPts); 93 } 94 this.eps = eps; 95 this.minPts = minPts; 96 } 97 98 /** 99 * Returns the maximum radius of the neighborhood to be considered. 100 * 101 * @return maximum radius of the neighborhood 102 */ getEps()103 public double getEps() { 104 return eps; 105 } 106 107 /** 108 * Returns the minimum number of points needed for a cluster. 109 * 110 * @return minimum number of points needed for a cluster 111 */ getMinPts()112 public int getMinPts() { 113 return minPts; 114 } 115 116 /** 117 * Performs DBSCAN cluster analysis. 118 * <p> 119 * <b>Note:</b> as DBSCAN is not a centroid-based clustering algorithm, the resulting 120 * {@link Cluster} objects will have no defined center, i.e. {@link Cluster#getCenter()} will 121 * return {@code null}. 122 * 123 * @param points the points to cluster 124 * @return the list of clusters 125 * @throws NullArgumentException if the data points are null 126 */ cluster(final Collection<T> points)127 public List<Cluster<T>> cluster(final Collection<T> points) throws NullArgumentException { 128 129 // sanity checks 130 MathUtils.checkNotNull(points); 131 132 final List<Cluster<T>> clusters = new ArrayList<Cluster<T>>(); 133 final Map<Clusterable<T>, PointStatus> visited = new HashMap<Clusterable<T>, PointStatus>(); 134 135 for (final T point : points) { 136 if (visited.get(point) != null) { 137 continue; 138 } 139 final List<T> neighbors = getNeighbors(point, points); 140 if (neighbors.size() >= minPts) { 141 // DBSCAN does not care about center points 142 final Cluster<T> cluster = new Cluster<T>(null); 143 clusters.add(expandCluster(cluster, point, neighbors, points, visited)); 144 } else { 145 visited.put(point, PointStatus.NOISE); 146 } 147 } 148 149 return clusters; 150 } 151 152 /** 153 * Expands the cluster to include density-reachable items. 154 * 155 * @param cluster Cluster to expand 156 * @param point Point to add to cluster 157 * @param neighbors List of neighbors 158 * @param points the data set 159 * @param visited the set of already visited points 160 * @return the expanded cluster 161 */ expandCluster(final Cluster<T> cluster, final T point, final List<T> neighbors, final Collection<T> points, final Map<Clusterable<T>, PointStatus> visited)162 private Cluster<T> expandCluster(final Cluster<T> cluster, 163 final T point, 164 final List<T> neighbors, 165 final Collection<T> points, 166 final Map<Clusterable<T>, PointStatus> visited) { 167 cluster.addPoint(point); 168 visited.put(point, PointStatus.PART_OF_CLUSTER); 169 170 List<T> seeds = new ArrayList<T>(neighbors); 171 int index = 0; 172 while (index < seeds.size()) { 173 final T current = seeds.get(index); 174 PointStatus pStatus = visited.get(current); 175 // only check non-visited points 176 if (pStatus == null) { 177 final List<T> currentNeighbors = getNeighbors(current, points); 178 if (currentNeighbors.size() >= minPts) { 179 seeds = merge(seeds, currentNeighbors); 180 } 181 } 182 183 if (pStatus != PointStatus.PART_OF_CLUSTER) { 184 visited.put(current, PointStatus.PART_OF_CLUSTER); 185 cluster.addPoint(current); 186 } 187 188 index++; 189 } 190 return cluster; 191 } 192 193 /** 194 * Returns a list of density-reachable neighbors of a {@code point}. 195 * 196 * @param point the point to look for 197 * @param points possible neighbors 198 * @return the List of neighbors 199 */ getNeighbors(final T point, final Collection<T> points)200 private List<T> getNeighbors(final T point, final Collection<T> points) { 201 final List<T> neighbors = new ArrayList<T>(); 202 for (final T neighbor : points) { 203 if (point != neighbor && neighbor.distanceFrom(point) <= eps) { 204 neighbors.add(neighbor); 205 } 206 } 207 return neighbors; 208 } 209 210 /** 211 * Merges two lists together. 212 * 213 * @param one first list 214 * @param two second list 215 * @return merged lists 216 */ merge(final List<T> one, final List<T> two)217 private List<T> merge(final List<T> one, final List<T> two) { 218 final Set<T> oneSet = new HashSet<T>(one); 219 for (T item : two) { 220 if (!oneSet.contains(item)) { 221 one.add(item); 222 } 223 } 224 return one; 225 } 226 } 227