1 /* 2 * This file is part of ELKI: 3 * Environment for Developing KDD-Applications Supported by Index-Structures 4 * 5 * Copyright (C) 2018 6 * ELKI Development Team 7 * 8 * This program is free software: you can redistribute it and/or modify 9 * it under the terms of the GNU Affero General Public License as published by 10 * the Free Software Foundation, either version 3 of the License, or 11 * (at your option) any later version. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU Affero General Public License for more details. 17 * 18 * You should have received a copy of the GNU Affero General Public License 19 * along with this program. If not, see <http://www.gnu.org/licenses/>. 20 */ 21 package de.lmu.ifi.dbs.elki.algorithm.outlier.subspace; 22 23 import java.util.Arrays; 24 25 import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; 26 import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm; 27 import de.lmu.ifi.dbs.elki.data.NumberVector; 28 import de.lmu.ifi.dbs.elki.data.type.TypeInformation; 29 import de.lmu.ifi.dbs.elki.data.type.TypeUtil; 30 import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; 31 import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; 32 import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; 33 import de.lmu.ifi.dbs.elki.database.ids.*; 34 import de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation; 35 import de.lmu.ifi.dbs.elki.database.relation.Relation; 36 import de.lmu.ifi.dbs.elki.database.relation.RelationUtil; 37 import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; 38 import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction; 39 import de.lmu.ifi.dbs.elki.logging.Logging; 40 import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; 41 import de.lmu.ifi.dbs.elki.math.DoubleMinMax; 42 import de.lmu.ifi.dbs.elki.math.MathUtil; 43 import de.lmu.ifi.dbs.elki.math.MeanVariance; 44 import de.lmu.ifi.dbs.elki.math.statistics.distribution.GammaDistribution; 45 import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.EpanechnikovKernelDensityFunction; 46 import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.KernelDensityFunction; 47 import de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta; 48 import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult; 49 import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta; 50 import de.lmu.ifi.dbs.elki.utilities.datastructures.BitsUtil; 51 import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; 52 import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; 53 import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; 54 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; 55 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; 56 57 import net.jafama.FastMath; 58 59 /** 60 * Adaptive outlierness for subspace outlier ranking (OUTRES). 61 * <p> 62 * Note: this algorithm seems to have a O(n³d!) complexity with no obvious way 63 * to accelerate it with usual index structures for range queries: each object 64 * in each tested subspace will need to know the mean and standard deviation of 65 * the density of the neighbors, which in turn needs another range query; except 66 * if we precomputed the densities for each of O(d!) possible subsets of 67 * dimensions. 68 * <p> 69 * Reference: 70 * <p> 71 * E. Müller, M. Schiffer, T. Seidl<br> 72 * Adaptive outlierness for subspace outlier ranking<br> 73 * Proc. 19th ACM Int. Conf. on Information and Knowledge Management 74 * 75 * @author Viktoria Pleintinger 76 * @author Erich Schubert 77 * @since 0.5.0 78 * 79 * @composed - - - KernelDensityEstimator 80 */ 81 @Reference(authors = "E. Müller, M. Schiffer, T. Seidl", // 82 title = "Adaptive outlierness for subspace outlier ranking", // 83 booktitle = "Proc. 19th ACM Int. Conf. on Information and Knowledge Management", // 84 url = "https://doi.org/10.1145/1871437.1871690", // 85 bibkey = "DBLP:conf/cikm/MullerSS10") 86 public class OUTRES extends AbstractAlgorithm<OutlierResult> implements OutlierAlgorithm { 87 /** 88 * The logger for this class. 89 */ 90 private static final Logging LOG = Logging.getLogger(OUTRES.class); 91 92 /** 93 * The epsilon (in 2d) parameter 94 */ 95 private final double eps; 96 97 /** 98 * Constant for Kolmogorov-Smirnov at alpha=0.01 (table value) 99 */ 100 private static final double K_S_CRITICAL001 = 1.63; 101 102 /** 103 * Constructor. 104 * 105 * @param eps Epsilon 106 */ OUTRES(double eps)107 public OUTRES(double eps) { 108 super(); 109 this.eps = eps; 110 } 111 112 /** 113 * Main loop for OUTRES 114 * 115 * @param relation Relation to process 116 * @return Outlier detection result 117 */ run(Relation<? extends NumberVector> relation)118 public OutlierResult run(Relation<? extends NumberVector> relation) { 119 final DBIDs ids = relation.getDBIDs(); 120 WritableDoubleDataStore ranks = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC); 121 DoubleMinMax minmax = new DoubleMinMax(); 122 123 KernelDensityEstimator kernel = new KernelDensityEstimator(relation, eps); 124 long[] subspace = BitsUtil.zero(kernel.dim); 125 126 FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("OUTRES scores", ids.size(), LOG) : null; 127 128 for(DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) { 129 BitsUtil.zeroI(subspace); 130 double score = outresScore(0, subspace, iditer, kernel, ids); 131 ranks.putDouble(iditer, score); 132 minmax.put(score); 133 LOG.incrementProcessed(progress); 134 } 135 LOG.ensureCompleted(progress); 136 137 OutlierScoreMeta meta = new InvertedOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0., 1., 1.); 138 return new OutlierResult(meta, new MaterializedDoubleRelation("OUTRES", "outres-score", ranks, ids)); 139 } 140 141 /** 142 * Main loop of OUTRES. Run for each object 143 * 144 * @param s start dimension 145 * @param subspace Current subspace 146 * @param id Current object ID 147 * @param kernel Kernel 148 * @param cands neighbor candidates 149 * @return Score 150 */ outresScore(final int s, long[] subspace, DBIDRef id, KernelDensityEstimator kernel, DBIDs cands)151 public double outresScore(final int s, long[] subspace, DBIDRef id, KernelDensityEstimator kernel, DBIDs cands) { 152 double score = 1.0; // Initial score is 1.0 153 final SubspaceEuclideanDistanceFunction df = new SubspaceEuclideanDistanceFunction(subspace); 154 MeanVariance meanv = new MeanVariance(); 155 ModifiableDoubleDBIDList neighcand = DBIDUtil.newDistanceDBIDList(cands.size()); 156 ModifiableDoubleDBIDList nn = DBIDUtil.newDistanceDBIDList(cands.size()); 157 158 for(int i = s; i < kernel.dim; i++) { 159 assert !BitsUtil.get(subspace, i); 160 BitsUtil.setI(subspace, i); 161 df.setSelectedDimensions(subspace); 162 final double adjustedEps = kernel.adjustedEps(kernel.dim); 163 DoubleDBIDList neigh = initialRange(id, cands, df, adjustedEps * 2, kernel, neighcand); 164 // Relevance test 165 if(neigh.size() > 2) { 166 if(relevantSubspace(subspace, neigh, kernel)) { 167 final double density = kernel.subspaceDensity(subspace, neigh); 168 // Compute mean and standard deviation for densities of neighbors. 169 meanv.reset(); 170 for(DoubleDBIDListIter neighbor = neigh.iter(); neighbor.valid(); neighbor.advance()) { 171 subsetNeighborhoodQuery(neighcand, neighbor, df, adjustedEps, kernel, nn); 172 meanv.put(kernel.subspaceDensity(subspace, nn)); 173 } 174 final double deviation = (meanv.getMean() - density) / (2. * meanv.getSampleStddev()); 175 // High deviation: 176 if(deviation >= 1) { 177 score *= density / deviation; 178 } 179 // Recursion 180 score *= outresScore(i + 1, subspace, id, kernel, neighcand); 181 } 182 } 183 BitsUtil.clearI(subspace, i); 184 } 185 return score; 186 } 187 188 /** 189 * Initial range query. 190 * 191 * @param obj Object 192 * @param cands Candidates 193 * @param df Distance function 194 * @param eps Epsilon radius 195 * @param kernel Kernel 196 * @param n Output buffer 197 * @return Neighbors 198 */ initialRange(DBIDRef obj, DBIDs cands, PrimitiveDistanceFunction<? super NumberVector> df, double eps, KernelDensityEstimator kernel, ModifiableDoubleDBIDList n)199 private DoubleDBIDList initialRange(DBIDRef obj, DBIDs cands, PrimitiveDistanceFunction<? super NumberVector> df, double eps, KernelDensityEstimator kernel, ModifiableDoubleDBIDList n) { 200 n.clear(); 201 NumberVector o = kernel.relation.get(obj); 202 final double twoeps = eps * 2; 203 int matches = 0; 204 for(DBIDIter cand = cands.iter(); cand.valid(); cand.advance()) { 205 final double dist = df.distance(o, kernel.relation.get(cand)); 206 if(dist <= twoeps) { 207 n.add(dist, cand); 208 if(dist <= eps) { 209 ++matches; 210 } 211 } 212 } 213 n.sort(); 214 return n.slice(0, matches); 215 } 216 217 /** 218 * Refine neighbors within a subset. 219 * 220 * @param neighc Neighbor candidates 221 * @param dbid Query object 222 * @param df distance function 223 * @param adjustedEps Epsilon range 224 * @param kernel Kernel 225 * @param n Output list 226 * @return Neighbors of neighbor object 227 */ subsetNeighborhoodQuery(DoubleDBIDList neighc, DBIDRef dbid, PrimitiveDistanceFunction<? super NumberVector> df, double adjustedEps, KernelDensityEstimator kernel, ModifiableDoubleDBIDList n)228 private DoubleDBIDList subsetNeighborhoodQuery(DoubleDBIDList neighc, DBIDRef dbid, PrimitiveDistanceFunction<? super NumberVector> df, double adjustedEps, KernelDensityEstimator kernel, ModifiableDoubleDBIDList n) { 229 n.clear(); 230 NumberVector query = kernel.relation.get(dbid); 231 for(DoubleDBIDListIter neighbor = neighc.iter(); neighbor.valid(); neighbor.advance()) { 232 // TODO: use triangle inequality for pruning 233 double dist = df.distance(query, kernel.relation.get(neighbor)); 234 if(dist <= adjustedEps) { 235 n.add(dist, neighbor); 236 } 237 } 238 return n; 239 } 240 241 /** 242 * Subspace relevance test. 243 * 244 * @param subspace Subspace to test 245 * @param neigh Neighbor list 246 * @param kernel Kernel density estimator 247 * @return relevance test result 248 */ relevantSubspace(long[] subspace, DoubleDBIDList neigh, KernelDensityEstimator kernel)249 protected boolean relevantSubspace(long[] subspace, DoubleDBIDList neigh, KernelDensityEstimator kernel) { 250 final double crit = K_S_CRITICAL001 / FastMath.sqrt(neigh.size() - 2); 251 252 double[] data = new double[neigh.size()]; 253 Relation<? extends NumberVector> relation = kernel.relation; 254 for(int dim = BitsUtil.nextSetBit(subspace, 0); dim >= 0; dim = BitsUtil.nextSetBit(subspace, dim + 1)) { 255 // TODO: can/should we save this copy? 256 int count = 0; 257 for(DBIDIter neighbor = neigh.iter(); neighbor.valid(); neighbor.advance()) { 258 data[count++] = relation.get(neighbor).doubleValue(dim); 259 } 260 assert (count == neigh.size()); 261 Arrays.sort(data); 262 263 final double min = data[0], norm = data[data.length - 1] - min; 264 // Kolmogorow-Smirnow-Test against uniform distribution: 265 boolean flag = false; 266 for(int j = 1, end = data.length - 1; j < end; j++) { 267 if(Math.abs(j / (data.length - 2.) - (data[j] - min) / norm) > crit) { 268 flag = true; 269 break; 270 } 271 } 272 if(!flag) { 273 return false; 274 } 275 } 276 return true; 277 } 278 279 /** 280 * Kernel density estimation and utility class. 281 * 282 * @author Erich Schubert 283 */ 284 protected static class KernelDensityEstimator { 285 /** 286 * Actual kernel in use 287 */ 288 final KernelDensityFunction kernel = EpanechnikovKernelDensityFunction.KERNEL; 289 290 /** 291 * Relation to retrieve data from 292 */ 293 final Relation<? extends NumberVector> relation; 294 295 /** 296 * Epsilon values for different subspace dimensionalities 297 */ 298 final double[] epsilons; 299 300 /** 301 * Optimal bandwidth for a dimensionality of 2 302 */ 303 final double hopttwo; 304 305 /** 306 * Dimensionality of data set 307 */ 308 final int dim; 309 310 /** 311 * Constructor. 312 * 313 * @param relation Relation to apply to 314 */ KernelDensityEstimator(Relation<? extends NumberVector> relation, double eps)315 public KernelDensityEstimator(Relation<? extends NumberVector> relation, double eps) { 316 super(); 317 this.relation = relation; 318 dim = RelationUtil.dimensionality(relation); 319 hopttwo = optimalBandwidth(2); 320 epsilons = new double[dim + 1]; 321 Arrays.fill(epsilons, Double.NEGATIVE_INFINITY); 322 epsilons[2] = eps; 323 } 324 325 /** 326 * Compute density in the given subspace. 327 * 328 * @param subspace Subspace 329 * @param neighbors Neighbor distance list 330 * @return Density 331 */ subspaceDensity(long[] subspace, DoubleDBIDList neighbors)332 protected double subspaceDensity(long[] subspace, DoubleDBIDList neighbors) { 333 final double bandwidth = optimalBandwidth(BitsUtil.cardinality(subspace)); 334 double density = 0; 335 for(DoubleDBIDListIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { 336 double v = neighbor.doubleValue() / bandwidth; 337 density += v < 1 ? 1 - (v * v) : 0; 338 } 339 return density / relation.size(); 340 } 341 342 /** 343 * Compute optimal kernel bandwidth 344 * 345 * @param dim Dimensionality of subspace 346 * @return optimal bandwidth 347 */ 348 protected double optimalBandwidth(int dim) { 349 // Pi in the publication is redundant and cancels out! 350 double hopt = 8 * GammaDistribution.gamma(dim / 2.0 + 1) * (dim + 4) * MathUtil.powi(2, dim); 351 return hopt * FastMath.pow(relation.size(), (-1. / (dim + 4))); 352 } 353 354 /** 355 * Rescale the query radius based on the given dimensionality. 356 * 357 * @param dim Dimensionality 358 * @return Query radius 359 */ 360 protected double adjustedEps(int dim) { 361 // Cached 362 double e = epsilons[dim]; 363 if(e < 0) { 364 epsilons[dim] = e = epsilons[2] * optimalBandwidth(dim) / hopttwo; 365 } 366 return e; 367 } 368 } 369 370 @Override 371 protected Logging getLogger() { 372 return LOG; 373 } 374 375 @Override 376 public TypeInformation[] getInputTypeRestriction() { 377 return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); 378 } 379 380 /** 381 * Parameterization class. 382 * 383 * @author Viktoria Pleintinger 384 */ 385 public static class Parameterizer extends AbstractParameterizer { 386 /** 387 * Option ID for Epsilon parameter 388 */ 389 public static final OptionID D_ID = new OptionID("outres.epsilon", "Range value for OUTRES in 2 dimensions."); 390 391 /** 392 * Query radius 393 */ 394 protected double eps; 395 396 @Override 397 protected void makeOptions(Parameterization config) { 398 super.makeOptions(config); 399 final DoubleParameter param = new DoubleParameter(D_ID); 400 if(config.grab(param)) { 401 eps = param.getValue(); 402 } 403 } 404 405 @Override 406 protected OUTRES makeInstance() { 407 return new OUTRES(eps); 408 } 409 } 410 } 411