1 /*
2  * This file is part of ELKI:
3  * Environment for Developing KDD-Applications Supported by Index-Structures
4  *
5  * Copyright (C) 2018
6  * ELKI Development Team
7  *
8  * This program is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU Affero General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU Affero General Public License for more details.
17  *
18  * You should have received a copy of the GNU Affero General Public License
19  * along with this program. If not, see <http://www.gnu.org/licenses/>.
20  */
21 package de.lmu.ifi.dbs.elki.algorithm.outlier.distance;
22 
23 import de.lmu.ifi.dbs.elki.database.Database;
24 import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
25 import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
26 import de.lmu.ifi.dbs.elki.database.datastore.DoubleDataStore;
27 import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
28 import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
29 import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDList;
30 import de.lmu.ifi.dbs.elki.database.ids.KNNList;
31 import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery;
32 import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
33 import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
34 import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery;
35 import de.lmu.ifi.dbs.elki.database.relation.Relation;
36 import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
37 import de.lmu.ifi.dbs.elki.logging.Logging;
38 import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
39 import de.lmu.ifi.dbs.elki.utilities.Alias;
40 import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
41 import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
42 import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
43 import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
44 import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
45 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
46 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
47 
48 /**
49  * Simple distanced based outlier detection algorithm. User has to specify two
50  * parameters An object is flagged as an outlier if at least a fraction p of all
51  * data objects has a distance above d from c.
52  * <p>
53  * Reference:
54  * <p>
55  * E.M. Knorr, R. T. Ng:<br>
56  * Algorithms for Mining Distance-Based Outliers in Large Datasets,<br>
57  * In: Proc. Int. Conf. on Very Large Databases (VLDB'98)
58  * <p>
59  * This paper presents several Distance Based Outlier Detection algorithms.
60  * Implemented here is a simple index based algorithm as presented in section
61  * 3.1.
62  *
63  * @author Lisa Reichert
64  * @since 0.3
65  *
66  * @has - - - KNNQuery
67  *
68  * @param <O> the type of objects handled by this algorithm
69  */
70 @Title("DBOD: Distance Based Outlier Detection")
71 @Description("If the D-neighborhood of an object contains only very few objects (less than (1-p) percent of the data) this object is flagged as an outlier")
72 @Reference(authors = "E. M. Knorr, R. T. Ng", //
73     title = "Algorithms for Mining Distance-Based Outliers in Large Datasets", //
74     booktitle = "Proc. Int. Conf. on Very Large Databases (VLDB'98)", //
75     url = "http://www.vldb.org/conf/1998/p392.pdf", //
76     bibkey = "DBLP:conf/vldb/KnorrN98")
77 @Alias({ "de.lmu.ifi.dbs.elki.algorithm.outlier.DBOutlierDetection" })
78 public class DBOutlierDetection<O> extends AbstractDBOutlier<O> {
79   /**
80    * The logger for this class.
81    */
82   private static final Logging LOG = Logging.getLogger(DBOutlierDetection.class);
83 
84   /**
85    * Density threshold percentage p.
86    */
87   private double p;
88 
89   /**
90    * Constructor with actual parameters.
91    *
92    * @param distanceFunction distance function parameter
93    * @param d distance query radius
94    * @param p percentage parameter
95    */
DBOutlierDetection(DistanceFunction<? super O> distanceFunction, double d, double p)96   public DBOutlierDetection(DistanceFunction<? super O> distanceFunction, double d, double p) {
97     super(distanceFunction, d);
98     this.p = p;
99   }
100 
101   @Override
computeOutlierScores(Database database, Relation<O> relation, double d)102   protected DoubleDataStore computeOutlierScores(Database database, Relation<O> relation, double d) {
103     DistanceQuery<O> distFunc = database.getDistanceQuery(relation, getDistanceFunction());
104     // Prefer kNN query if available, as this will usually stop earlier.
105     KNNQuery<O> knnQuery = database.getKNNQuery(distFunc, DatabaseQuery.HINT_OPTIMIZED_ONLY);
106     RangeQuery<O> rangeQuery = knnQuery == null ? database.getRangeQuery(distFunc, DatabaseQuery.HINT_OPTIMIZED_ONLY, d) : null;
107 
108     // maximum number of objects in the D-neighborhood of an outlier
109     int m = (int) Math.floor((distFunc.getRelation().size()) * (1 - p));
110 
111     WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(distFunc.getRelation().getDBIDs(), DataStoreFactory.HINT_STATIC);
112 
113     FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("DBOutlier detection", distFunc.getRelation().size(), LOG) : null;
114     // if index exists, kNN query. if the distance to the mth nearest neighbor
115     // is more than d -> object is outlier
116     if(knnQuery != null) {
117       if(LOG.isVeryVerbose()) {
118         LOG.veryverbose("Using kNN query: " + knnQuery.toString());
119       }
120       for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
121         KNNList knns = knnQuery.getKNNForDBID(iditer, m);
122         scores.putDouble(iditer, (knns.getKNNDistance() > d) ? 1. : 0.);
123         LOG.incrementProcessed(prog);
124       }
125     }
126     else if(rangeQuery != null) {
127       if(LOG.isVeryVerbose()) {
128         LOG.veryverbose("Using range query: " + rangeQuery.toString());
129       }
130       for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
131         DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(iditer, d);
132         scores.putDouble(iditer, (neighbors.size() < m) ? 1. : 0.);
133         LOG.incrementProcessed(prog);
134       }
135     }
136     else {
137       // Linear scan neighbors for each object, but stop early.
138       for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
139         int count = 0;
140         for(DBIDIter iterator = relation.iterDBIDs(); iterator.valid(); iterator.advance()) {
141           double currentDistance = distFunc.distance(iditer, iterator);
142           if(currentDistance <= d) {
143             if(++count >= m) {
144               break;
145             }
146           }
147         }
148         scores.putDouble(iditer, (count < m) ? 1.0 : 0);
149         LOG.incrementProcessed(prog);
150       }
151     }
152     LOG.ensureCompleted(prog);
153     return scores;
154   }
155 
156   @Override
getLogger()157   protected Logging getLogger() {
158     return LOG;
159   }
160 
161   /**
162    * Parameterization class.
163    *
164    * @author Erich Schubert
165    */
166   public static class Parameterizer<O> extends AbstractDBOutlier.Parameterizer<O> {
167     /**
168      * Parameter to specify the minimum fraction of objects that must be outside
169      * the D- neighborhood of an outlier
170      */
171     public static final OptionID P_ID = new OptionID("dbod.p", "minimum fraction of objects that must be outside the D-neighborhood of an outlier");
172 
173     /**
174      * Density threshold p.
175      */
176     protected double p = 0.0;
177 
178     @Override
makeOptions(Parameterization config)179     protected void makeOptions(Parameterization config) {
180       super.makeOptions(config);
181       final DoubleParameter pP = new DoubleParameter(P_ID) //
182           .addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE) //
183           .addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE);
184       if(config.grab(pP)) {
185         p = pP.getValue();
186       }
187     }
188 
189     @Override
makeInstance()190     protected DBOutlierDetection<O> makeInstance() {
191       return new DBOutlierDetection<>(distanceFunction, d, p);
192     }
193   }
194 }
195