1 /*
2  * This file is part of ELKI:
3  * Environment for Developing KDD-Applications Supported by Index-Structures
4  *
5  * Copyright (C) 2018
6  * ELKI Development Team
7  *
8  * This program is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU Affero General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU Affero General Public License for more details.
17  *
18  * You should have received a copy of the GNU Affero General Public License
19  * along with this program. If not, see <http://www.gnu.org/licenses/>.
20  */
21 package de.lmu.ifi.dbs.elki.algorithm.outlier.subspace;
22 
23 import java.util.Arrays;
24 
25 import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
26 import de.lmu.ifi.dbs.elki.algorithm.outlier.OutlierAlgorithm;
27 import de.lmu.ifi.dbs.elki.data.NumberVector;
28 import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
29 import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
30 import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
31 import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
32 import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
33 import de.lmu.ifi.dbs.elki.database.ids.*;
34 import de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation;
35 import de.lmu.ifi.dbs.elki.database.relation.Relation;
36 import de.lmu.ifi.dbs.elki.database.relation.RelationUtil;
37 import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
38 import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction;
39 import de.lmu.ifi.dbs.elki.logging.Logging;
40 import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
41 import de.lmu.ifi.dbs.elki.math.DoubleMinMax;
42 import de.lmu.ifi.dbs.elki.math.MathUtil;
43 import de.lmu.ifi.dbs.elki.math.MeanVariance;
44 import de.lmu.ifi.dbs.elki.math.statistics.distribution.GammaDistribution;
45 import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.EpanechnikovKernelDensityFunction;
46 import de.lmu.ifi.dbs.elki.math.statistics.kernelfunctions.KernelDensityFunction;
47 import de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta;
48 import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;
49 import de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta;
50 import de.lmu.ifi.dbs.elki.utilities.datastructures.BitsUtil;
51 import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
52 import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
53 import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
54 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
55 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
56 
57 import net.jafama.FastMath;
58 
59 /**
60  * Adaptive outlierness for subspace outlier ranking (OUTRES).
61  * <p>
62  * Note: this algorithm seems to have a O(n³d!) complexity with no obvious way
63  * to accelerate it with usual index structures for range queries: each object
64  * in each tested subspace will need to know the mean and standard deviation of
65  * the density of the neighbors, which in turn needs another range query; except
66  * if we precomputed the densities for each of O(d!) possible subsets of
67  * dimensions.
68  * <p>
69  * Reference:
70  * <p>
71  * E. Müller, M. Schiffer, T. Seidl<br>
72  * Adaptive outlierness for subspace outlier ranking<br>
73  * Proc. 19th ACM Int. Conf. on Information and Knowledge Management
74  *
75  * @author Viktoria Pleintinger
76  * @author Erich Schubert
77  * @since 0.5.0
78  *
79  * @composed - - - KernelDensityEstimator
80  */
81 @Reference(authors = "E. Müller, M. Schiffer, T. Seidl", //
82     title = "Adaptive outlierness for subspace outlier ranking", //
83     booktitle = "Proc. 19th ACM Int. Conf. on Information and Knowledge Management", //
84     url = "https://doi.org/10.1145/1871437.1871690", //
85     bibkey = "DBLP:conf/cikm/MullerSS10")
86 public class OUTRES extends AbstractAlgorithm<OutlierResult> implements OutlierAlgorithm {
87   /**
88    * The logger for this class.
89    */
90   private static final Logging LOG = Logging.getLogger(OUTRES.class);
91 
92   /**
93    * The epsilon (in 2d) parameter
94    */
95   private final double eps;
96 
97   /**
98    * Constant for Kolmogorov-Smirnov at alpha=0.01 (table value)
99    */
100   private static final double K_S_CRITICAL001 = 1.63;
101 
102   /**
103    * Constructor.
104    *
105    * @param eps Epsilon
106    */
OUTRES(double eps)107   public OUTRES(double eps) {
108     super();
109     this.eps = eps;
110   }
111 
112   /**
113    * Main loop for OUTRES
114    *
115    * @param relation Relation to process
116    * @return Outlier detection result
117    */
run(Relation<? extends NumberVector> relation)118   public OutlierResult run(Relation<? extends NumberVector> relation) {
119     final DBIDs ids = relation.getDBIDs();
120     WritableDoubleDataStore ranks = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC);
121     DoubleMinMax minmax = new DoubleMinMax();
122 
123     KernelDensityEstimator kernel = new KernelDensityEstimator(relation, eps);
124     long[] subspace = BitsUtil.zero(kernel.dim);
125 
126     FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("OUTRES scores", ids.size(), LOG) : null;
127 
128     for(DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
129       BitsUtil.zeroI(subspace);
130       double score = outresScore(0, subspace, iditer, kernel, ids);
131       ranks.putDouble(iditer, score);
132       minmax.put(score);
133       LOG.incrementProcessed(progress);
134     }
135     LOG.ensureCompleted(progress);
136 
137     OutlierScoreMeta meta = new InvertedOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0., 1., 1.);
138     return new OutlierResult(meta, new MaterializedDoubleRelation("OUTRES", "outres-score", ranks, ids));
139   }
140 
141   /**
142    * Main loop of OUTRES. Run for each object
143    *
144    * @param s start dimension
145    * @param subspace Current subspace
146    * @param id Current object ID
147    * @param kernel Kernel
148    * @param cands neighbor candidates
149    * @return Score
150    */
outresScore(final int s, long[] subspace, DBIDRef id, KernelDensityEstimator kernel, DBIDs cands)151   public double outresScore(final int s, long[] subspace, DBIDRef id, KernelDensityEstimator kernel, DBIDs cands) {
152     double score = 1.0; // Initial score is 1.0
153     final SubspaceEuclideanDistanceFunction df = new SubspaceEuclideanDistanceFunction(subspace);
154     MeanVariance meanv = new MeanVariance();
155     ModifiableDoubleDBIDList neighcand = DBIDUtil.newDistanceDBIDList(cands.size());
156     ModifiableDoubleDBIDList nn = DBIDUtil.newDistanceDBIDList(cands.size());
157 
158     for(int i = s; i < kernel.dim; i++) {
159       assert !BitsUtil.get(subspace, i);
160       BitsUtil.setI(subspace, i);
161       df.setSelectedDimensions(subspace);
162       final double adjustedEps = kernel.adjustedEps(kernel.dim);
163       DoubleDBIDList neigh = initialRange(id, cands, df, adjustedEps * 2, kernel, neighcand);
164       // Relevance test
165       if(neigh.size() > 2) {
166         if(relevantSubspace(subspace, neigh, kernel)) {
167           final double density = kernel.subspaceDensity(subspace, neigh);
168           // Compute mean and standard deviation for densities of neighbors.
169           meanv.reset();
170           for(DoubleDBIDListIter neighbor = neigh.iter(); neighbor.valid(); neighbor.advance()) {
171             subsetNeighborhoodQuery(neighcand, neighbor, df, adjustedEps, kernel, nn);
172             meanv.put(kernel.subspaceDensity(subspace, nn));
173           }
174           final double deviation = (meanv.getMean() - density) / (2. * meanv.getSampleStddev());
175           // High deviation:
176           if(deviation >= 1) {
177             score *= density / deviation;
178           }
179           // Recursion
180           score *= outresScore(i + 1, subspace, id, kernel, neighcand);
181         }
182       }
183       BitsUtil.clearI(subspace, i);
184     }
185     return score;
186   }
187 
188   /**
189    * Initial range query.
190    *
191    * @param obj Object
192    * @param cands Candidates
193    * @param df Distance function
194    * @param eps Epsilon radius
195    * @param kernel Kernel
196    * @param n Output buffer
197    * @return Neighbors
198    */
initialRange(DBIDRef obj, DBIDs cands, PrimitiveDistanceFunction<? super NumberVector> df, double eps, KernelDensityEstimator kernel, ModifiableDoubleDBIDList n)199   private DoubleDBIDList initialRange(DBIDRef obj, DBIDs cands, PrimitiveDistanceFunction<? super NumberVector> df, double eps, KernelDensityEstimator kernel, ModifiableDoubleDBIDList n) {
200     n.clear();
201     NumberVector o = kernel.relation.get(obj);
202     final double twoeps = eps * 2;
203     int matches = 0;
204     for(DBIDIter cand = cands.iter(); cand.valid(); cand.advance()) {
205       final double dist = df.distance(o, kernel.relation.get(cand));
206       if(dist <= twoeps) {
207         n.add(dist, cand);
208         if(dist <= eps) {
209           ++matches;
210         }
211       }
212     }
213     n.sort();
214     return n.slice(0, matches);
215   }
216 
217   /**
218    * Refine neighbors within a subset.
219    *
220    * @param neighc Neighbor candidates
221    * @param dbid Query object
222    * @param df distance function
223    * @param adjustedEps Epsilon range
224    * @param kernel Kernel
225    * @param n Output list
226    * @return Neighbors of neighbor object
227    */
subsetNeighborhoodQuery(DoubleDBIDList neighc, DBIDRef dbid, PrimitiveDistanceFunction<? super NumberVector> df, double adjustedEps, KernelDensityEstimator kernel, ModifiableDoubleDBIDList n)228   private DoubleDBIDList subsetNeighborhoodQuery(DoubleDBIDList neighc, DBIDRef dbid, PrimitiveDistanceFunction<? super NumberVector> df, double adjustedEps, KernelDensityEstimator kernel, ModifiableDoubleDBIDList n) {
229     n.clear();
230     NumberVector query = kernel.relation.get(dbid);
231     for(DoubleDBIDListIter neighbor = neighc.iter(); neighbor.valid(); neighbor.advance()) {
232       // TODO: use triangle inequality for pruning
233       double dist = df.distance(query, kernel.relation.get(neighbor));
234       if(dist <= adjustedEps) {
235         n.add(dist, neighbor);
236       }
237     }
238     return n;
239   }
240 
241   /**
242    * Subspace relevance test.
243    *
244    * @param subspace Subspace to test
245    * @param neigh Neighbor list
246    * @param kernel Kernel density estimator
247    * @return relevance test result
248    */
relevantSubspace(long[] subspace, DoubleDBIDList neigh, KernelDensityEstimator kernel)249   protected boolean relevantSubspace(long[] subspace, DoubleDBIDList neigh, KernelDensityEstimator kernel) {
250     final double crit = K_S_CRITICAL001 / FastMath.sqrt(neigh.size() - 2);
251 
252     double[] data = new double[neigh.size()];
253     Relation<? extends NumberVector> relation = kernel.relation;
254     for(int dim = BitsUtil.nextSetBit(subspace, 0); dim >= 0; dim = BitsUtil.nextSetBit(subspace, dim + 1)) {
255       // TODO: can/should we save this copy?
256       int count = 0;
257       for(DBIDIter neighbor = neigh.iter(); neighbor.valid(); neighbor.advance()) {
258         data[count++] = relation.get(neighbor).doubleValue(dim);
259       }
260       assert (count == neigh.size());
261       Arrays.sort(data);
262 
263       final double min = data[0], norm = data[data.length - 1] - min;
264       // Kolmogorow-Smirnow-Test against uniform distribution:
265       boolean flag = false;
266       for(int j = 1, end = data.length - 1; j < end; j++) {
267         if(Math.abs(j / (data.length - 2.) - (data[j] - min) / norm) > crit) {
268           flag = true;
269           break;
270         }
271       }
272       if(!flag) {
273         return false;
274       }
275     }
276     return true;
277   }
278 
279   /**
280    * Kernel density estimation and utility class.
281    *
282    * @author Erich Schubert
283    */
284   protected static class KernelDensityEstimator {
285     /**
286      * Actual kernel in use
287      */
288     final KernelDensityFunction kernel = EpanechnikovKernelDensityFunction.KERNEL;
289 
290     /**
291      * Relation to retrieve data from
292      */
293     final Relation<? extends NumberVector> relation;
294 
295     /**
296      * Epsilon values for different subspace dimensionalities
297      */
298     final double[] epsilons;
299 
300     /**
301      * Optimal bandwidth for a dimensionality of 2
302      */
303     final double hopttwo;
304 
305     /**
306      * Dimensionality of data set
307      */
308     final int dim;
309 
310     /**
311      * Constructor.
312      *
313      * @param relation Relation to apply to
314      */
KernelDensityEstimator(Relation<? extends NumberVector> relation, double eps)315     public KernelDensityEstimator(Relation<? extends NumberVector> relation, double eps) {
316       super();
317       this.relation = relation;
318       dim = RelationUtil.dimensionality(relation);
319       hopttwo = optimalBandwidth(2);
320       epsilons = new double[dim + 1];
321       Arrays.fill(epsilons, Double.NEGATIVE_INFINITY);
322       epsilons[2] = eps;
323     }
324 
325     /**
326      * Compute density in the given subspace.
327      *
328      * @param subspace Subspace
329      * @param neighbors Neighbor distance list
330      * @return Density
331      */
subspaceDensity(long[] subspace, DoubleDBIDList neighbors)332     protected double subspaceDensity(long[] subspace, DoubleDBIDList neighbors) {
333       final double bandwidth = optimalBandwidth(BitsUtil.cardinality(subspace));
334       double density = 0;
335       for(DoubleDBIDListIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) {
336         double v = neighbor.doubleValue() / bandwidth;
337         density += v < 1 ? 1 - (v * v) : 0;
338       }
339       return density / relation.size();
340     }
341 
342     /**
343      * Compute optimal kernel bandwidth
344      *
345      * @param dim Dimensionality of subspace
346      * @return optimal bandwidth
347      */
348     protected double optimalBandwidth(int dim) {
349       // Pi in the publication is redundant and cancels out!
350       double hopt = 8 * GammaDistribution.gamma(dim / 2.0 + 1) * (dim + 4) * MathUtil.powi(2, dim);
351       return hopt * FastMath.pow(relation.size(), (-1. / (dim + 4)));
352     }
353 
354     /**
355      * Rescale the query radius based on the given dimensionality.
356      *
357      * @param dim Dimensionality
358      * @return Query radius
359      */
360     protected double adjustedEps(int dim) {
361       // Cached
362       double e = epsilons[dim];
363       if(e < 0) {
364         epsilons[dim] = e = epsilons[2] * optimalBandwidth(dim) / hopttwo;
365       }
366       return e;
367     }
368   }
369 
370   @Override
371   protected Logging getLogger() {
372     return LOG;
373   }
374 
375   @Override
376   public TypeInformation[] getInputTypeRestriction() {
377     return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
378   }
379 
380   /**
381    * Parameterization class.
382    *
383    * @author Viktoria Pleintinger
384    */
385   public static class Parameterizer extends AbstractParameterizer {
386     /**
387      * Option ID for Epsilon parameter
388      */
389     public static final OptionID D_ID = new OptionID("outres.epsilon", "Range value for OUTRES in 2 dimensions.");
390 
391     /**
392      * Query radius
393      */
394     protected double eps;
395 
396     @Override
397     protected void makeOptions(Parameterization config) {
398       super.makeOptions(config);
399       final DoubleParameter param = new DoubleParameter(D_ID);
400       if(config.grab(param)) {
401         eps = param.getValue();
402       }
403     }
404 
405     @Override
406     protected OUTRES makeInstance() {
407       return new OUTRES(eps);
408     }
409   }
410 }
411