1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements.  See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License.  You may obtain a copy of the License at
8  *
9  *      http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 package org.apache.commons.math3.stat.clustering;
18 
19 import java.util.ArrayList;
20 import java.util.Collection;
21 import java.util.HashMap;
22 import java.util.HashSet;
23 import java.util.List;
24 import java.util.Map;
25 import java.util.Set;
26 
27 import org.apache.commons.math3.exception.NotPositiveException;
28 import org.apache.commons.math3.exception.NullArgumentException;
29 import org.apache.commons.math3.util.MathUtils;
30 
31 /**
32  * DBSCAN (density-based spatial clustering of applications with noise) algorithm.
33  * <p>
34  * The DBSCAN algorithm forms clusters based on the idea of density connectivity, i.e.
35  * a point p is density connected to another point q, if there exists a chain of
36  * points p<sub>i</sub>, with i = 1 .. n and p<sub>1</sub> = p and p<sub>n</sub> = q,
37  * such that each pair &lt;p<sub>i</sub>, p<sub>i+1</sub>&gt; is directly density-reachable.
38  * A point q is directly density-reachable from point p if it is in the &epsilon;-neighborhood
39  * of this point.
40  * <p>
41  * Any point that is not density-reachable from a formed cluster is treated as noise, and
42  * will thus not be present in the result.
43  * <p>
44  * The algorithm requires two parameters:
45  * <ul>
46  *   <li>eps: the distance that defines the &epsilon;-neighborhood of a point
47  *   <li>minPoints: the minimum number of density-connected points required to form a cluster
48  * </ul>
49  * <p>
50  * <b>Note:</b> as DBSCAN is not a centroid-based clustering algorithm, the resulting
51  * {@link Cluster} objects will have no defined center, i.e. {@link Cluster#getCenter()} will
52  * return {@code null}.
53  *
54  * @param <T> type of the points to cluster
55  * @see <a href="http://en.wikipedia.org/wiki/DBSCAN">DBSCAN (wikipedia)</a>
56  * @see <a href="http://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf">
57  * A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise</a>
58  * @since 3.1
59  * @deprecated As of 3.2 (to be removed in 4.0),
60  * use {@link org.apache.commons.math3.ml.clustering.DBSCANClusterer} instead
61  */
62 @Deprecated
63 public class DBSCANClusterer<T extends Clusterable<T>> {
64 
65     /** Maximum radius of the neighborhood to be considered. */
66     private final double              eps;
67 
68     /** Minimum number of points needed for a cluster. */
69     private final int                 minPts;
70 
71     /** Status of a point during the clustering process. */
72     private enum PointStatus {
73         /** The point has is considered to be noise. */
74         NOISE,
75         /** The point is already part of a cluster. */
76         PART_OF_CLUSTER
77     }
78 
79     /**
80      * Creates a new instance of a DBSCANClusterer.
81      *
82      * @param eps maximum radius of the neighborhood to be considered
83      * @param minPts minimum number of points needed for a cluster
84      * @throws NotPositiveException if {@code eps < 0.0} or {@code minPts < 0}
85      */
DBSCANClusterer(final double eps, final int minPts)86     public DBSCANClusterer(final double eps, final int minPts)
87         throws NotPositiveException {
88         if (eps < 0.0d) {
89             throw new NotPositiveException(eps);
90         }
91         if (minPts < 0) {
92             throw new NotPositiveException(minPts);
93         }
94         this.eps = eps;
95         this.minPts = minPts;
96     }
97 
98     /**
99      * Returns the maximum radius of the neighborhood to be considered.
100      *
101      * @return maximum radius of the neighborhood
102      */
getEps()103     public double getEps() {
104         return eps;
105     }
106 
107     /**
108      * Returns the minimum number of points needed for a cluster.
109      *
110      * @return minimum number of points needed for a cluster
111      */
getMinPts()112     public int getMinPts() {
113         return minPts;
114     }
115 
116     /**
117      * Performs DBSCAN cluster analysis.
118      * <p>
119      * <b>Note:</b> as DBSCAN is not a centroid-based clustering algorithm, the resulting
120      * {@link Cluster} objects will have no defined center, i.e. {@link Cluster#getCenter()} will
121      * return {@code null}.
122      *
123      * @param points the points to cluster
124      * @return the list of clusters
125      * @throws NullArgumentException if the data points are null
126      */
cluster(final Collection<T> points)127     public List<Cluster<T>> cluster(final Collection<T> points) throws NullArgumentException {
128 
129         // sanity checks
130         MathUtils.checkNotNull(points);
131 
132         final List<Cluster<T>> clusters = new ArrayList<Cluster<T>>();
133         final Map<Clusterable<T>, PointStatus> visited = new HashMap<Clusterable<T>, PointStatus>();
134 
135         for (final T point : points) {
136             if (visited.get(point) != null) {
137                 continue;
138             }
139             final List<T> neighbors = getNeighbors(point, points);
140             if (neighbors.size() >= minPts) {
141                 // DBSCAN does not care about center points
142                 final Cluster<T> cluster = new Cluster<T>(null);
143                 clusters.add(expandCluster(cluster, point, neighbors, points, visited));
144             } else {
145                 visited.put(point, PointStatus.NOISE);
146             }
147         }
148 
149         return clusters;
150     }
151 
152     /**
153      * Expands the cluster to include density-reachable items.
154      *
155      * @param cluster Cluster to expand
156      * @param point Point to add to cluster
157      * @param neighbors List of neighbors
158      * @param points the data set
159      * @param visited the set of already visited points
160      * @return the expanded cluster
161      */
expandCluster(final Cluster<T> cluster, final T point, final List<T> neighbors, final Collection<T> points, final Map<Clusterable<T>, PointStatus> visited)162     private Cluster<T> expandCluster(final Cluster<T> cluster,
163                                      final T point,
164                                      final List<T> neighbors,
165                                      final Collection<T> points,
166                                      final Map<Clusterable<T>, PointStatus> visited) {
167         cluster.addPoint(point);
168         visited.put(point, PointStatus.PART_OF_CLUSTER);
169 
170         List<T> seeds = new ArrayList<T>(neighbors);
171         int index = 0;
172         while (index < seeds.size()) {
173             final T current = seeds.get(index);
174             PointStatus pStatus = visited.get(current);
175             // only check non-visited points
176             if (pStatus == null) {
177                 final List<T> currentNeighbors = getNeighbors(current, points);
178                 if (currentNeighbors.size() >= minPts) {
179                     seeds = merge(seeds, currentNeighbors);
180                 }
181             }
182 
183             if (pStatus != PointStatus.PART_OF_CLUSTER) {
184                 visited.put(current, PointStatus.PART_OF_CLUSTER);
185                 cluster.addPoint(current);
186             }
187 
188             index++;
189         }
190         return cluster;
191     }
192 
193     /**
194      * Returns a list of density-reachable neighbors of a {@code point}.
195      *
196      * @param point the point to look for
197      * @param points possible neighbors
198      * @return the List of neighbors
199      */
getNeighbors(final T point, final Collection<T> points)200     private List<T> getNeighbors(final T point, final Collection<T> points) {
201         final List<T> neighbors = new ArrayList<T>();
202         for (final T neighbor : points) {
203             if (point != neighbor && neighbor.distanceFrom(point) <= eps) {
204                 neighbors.add(neighbor);
205             }
206         }
207         return neighbors;
208     }
209 
210     /**
211      * Merges two lists together.
212      *
213      * @param one first list
214      * @param two second list
215      * @return merged lists
216      */
merge(final List<T> one, final List<T> two)217     private List<T> merge(final List<T> one, final List<T> two) {
218         final Set<T> oneSet = new HashSet<T>(one);
219         for (T item : two) {
220             if (!oneSet.contains(item)) {
221                 one.add(item);
222             }
223         }
224         return one;
225     }
226 }
227