1 /* 2 * This file is part of ELKI: 3 * Environment for Developing KDD-Applications Supported by Index-Structures 4 * 5 * Copyright (C) 2018 6 * ELKI Development Team 7 * 8 * This program is free software: you can redistribute it and/or modify 9 * it under the terms of the GNU Affero General Public License as published by 10 * the Free Software Foundation, either version 3 of the License, or 11 * (at your option) any later version. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU Affero General Public License for more details. 17 * 18 * You should have received a copy of the GNU Affero General Public License 19 * along with this program. If not, see <http://www.gnu.org/licenses/>. 20 */ 21 package de.lmu.ifi.dbs.elki.algorithm.outlier.distance; 22 23 import de.lmu.ifi.dbs.elki.database.Database; 24 import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; 25 import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; 26 import de.lmu.ifi.dbs.elki.database.datastore.DoubleDataStore; 27 import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; 28 import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; 29 import de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDList; 30 import de.lmu.ifi.dbs.elki.database.ids.KNNList; 31 import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery; 32 import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; 33 import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery; 34 import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; 35 import de.lmu.ifi.dbs.elki.database.relation.Relation; 36 import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; 37 import de.lmu.ifi.dbs.elki.logging.Logging; 38 import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; 39 import de.lmu.ifi.dbs.elki.utilities.Alias; 40 import de.lmu.ifi.dbs.elki.utilities.documentation.Description; 41 import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; 42 import de.lmu.ifi.dbs.elki.utilities.documentation.Title; 43 import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; 44 import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; 45 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; 46 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; 47 48 /** 49 * Simple distanced based outlier detection algorithm. User has to specify two 50 * parameters An object is flagged as an outlier if at least a fraction p of all 51 * data objects has a distance above d from c. 52 * <p> 53 * Reference: 54 * <p> 55 * E.M. Knorr, R. T. Ng:<br> 56 * Algorithms for Mining Distance-Based Outliers in Large Datasets,<br> 57 * In: Proc. Int. Conf. on Very Large Databases (VLDB'98) 58 * <p> 59 * This paper presents several Distance Based Outlier Detection algorithms. 60 * Implemented here is a simple index based algorithm as presented in section 61 * 3.1. 62 * 63 * @author Lisa Reichert 64 * @since 0.3 65 * 66 * @has - - - KNNQuery 67 * 68 * @param <O> the type of objects handled by this algorithm 69 */ 70 @Title("DBOD: Distance Based Outlier Detection") 71 @Description("If the D-neighborhood of an object contains only very few objects (less than (1-p) percent of the data) this object is flagged as an outlier") 72 @Reference(authors = "E. M. Knorr, R. T. Ng", // 73 title = "Algorithms for Mining Distance-Based Outliers in Large Datasets", // 74 booktitle = "Proc. Int. Conf. on Very Large Databases (VLDB'98)", // 75 url = "http://www.vldb.org/conf/1998/p392.pdf", // 76 bibkey = "DBLP:conf/vldb/KnorrN98") 77 @Alias({ "de.lmu.ifi.dbs.elki.algorithm.outlier.DBOutlierDetection" }) 78 public class DBOutlierDetection<O> extends AbstractDBOutlier<O> { 79 /** 80 * The logger for this class. 81 */ 82 private static final Logging LOG = Logging.getLogger(DBOutlierDetection.class); 83 84 /** 85 * Density threshold percentage p. 86 */ 87 private double p; 88 89 /** 90 * Constructor with actual parameters. 91 * 92 * @param distanceFunction distance function parameter 93 * @param d distance query radius 94 * @param p percentage parameter 95 */ DBOutlierDetection(DistanceFunction<? super O> distanceFunction, double d, double p)96 public DBOutlierDetection(DistanceFunction<? super O> distanceFunction, double d, double p) { 97 super(distanceFunction, d); 98 this.p = p; 99 } 100 101 @Override computeOutlierScores(Database database, Relation<O> relation, double d)102 protected DoubleDataStore computeOutlierScores(Database database, Relation<O> relation, double d) { 103 DistanceQuery<O> distFunc = database.getDistanceQuery(relation, getDistanceFunction()); 104 // Prefer kNN query if available, as this will usually stop earlier. 105 KNNQuery<O> knnQuery = database.getKNNQuery(distFunc, DatabaseQuery.HINT_OPTIMIZED_ONLY); 106 RangeQuery<O> rangeQuery = knnQuery == null ? database.getRangeQuery(distFunc, DatabaseQuery.HINT_OPTIMIZED_ONLY, d) : null; 107 108 // maximum number of objects in the D-neighborhood of an outlier 109 int m = (int) Math.floor((distFunc.getRelation().size()) * (1 - p)); 110 111 WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(distFunc.getRelation().getDBIDs(), DataStoreFactory.HINT_STATIC); 112 113 FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("DBOutlier detection", distFunc.getRelation().size(), LOG) : null; 114 // if index exists, kNN query. if the distance to the mth nearest neighbor 115 // is more than d -> object is outlier 116 if(knnQuery != null) { 117 if(LOG.isVeryVerbose()) { 118 LOG.veryverbose("Using kNN query: " + knnQuery.toString()); 119 } 120 for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { 121 KNNList knns = knnQuery.getKNNForDBID(iditer, m); 122 scores.putDouble(iditer, (knns.getKNNDistance() > d) ? 1. : 0.); 123 LOG.incrementProcessed(prog); 124 } 125 } 126 else if(rangeQuery != null) { 127 if(LOG.isVeryVerbose()) { 128 LOG.veryverbose("Using range query: " + rangeQuery.toString()); 129 } 130 for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { 131 DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(iditer, d); 132 scores.putDouble(iditer, (neighbors.size() < m) ? 1. : 0.); 133 LOG.incrementProcessed(prog); 134 } 135 } 136 else { 137 // Linear scan neighbors for each object, but stop early. 138 for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { 139 int count = 0; 140 for(DBIDIter iterator = relation.iterDBIDs(); iterator.valid(); iterator.advance()) { 141 double currentDistance = distFunc.distance(iditer, iterator); 142 if(currentDistance <= d) { 143 if(++count >= m) { 144 break; 145 } 146 } 147 } 148 scores.putDouble(iditer, (count < m) ? 1.0 : 0); 149 LOG.incrementProcessed(prog); 150 } 151 } 152 LOG.ensureCompleted(prog); 153 return scores; 154 } 155 156 @Override getLogger()157 protected Logging getLogger() { 158 return LOG; 159 } 160 161 /** 162 * Parameterization class. 163 * 164 * @author Erich Schubert 165 */ 166 public static class Parameterizer<O> extends AbstractDBOutlier.Parameterizer<O> { 167 /** 168 * Parameter to specify the minimum fraction of objects that must be outside 169 * the D- neighborhood of an outlier 170 */ 171 public static final OptionID P_ID = new OptionID("dbod.p", "minimum fraction of objects that must be outside the D-neighborhood of an outlier"); 172 173 /** 174 * Density threshold p. 175 */ 176 protected double p = 0.0; 177 178 @Override makeOptions(Parameterization config)179 protected void makeOptions(Parameterization config) { 180 super.makeOptions(config); 181 final DoubleParameter pP = new DoubleParameter(P_ID) // 182 .addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE) // 183 .addConstraint(CommonConstraints.LESS_THAN_ONE_DOUBLE); 184 if(config.grab(pP)) { 185 p = pP.getValue(); 186 } 187 } 188 189 @Override makeInstance()190 protected DBOutlierDetection<O> makeInstance() { 191 return new DBOutlierDetection<>(distanceFunction, d, p); 192 } 193 } 194 } 195