1 /* 2 * This file is part of ELKI: 3 * Environment for Developing KDD-Applications Supported by Index-Structures 4 * 5 * Copyright (C) 2018 6 * ELKI Development Team 7 * 8 * This program is free software: you can redistribute it and/or modify 9 * it under the terms of the GNU Affero General Public License as published by 10 * the Free Software Foundation, either version 3 of the License, or 11 * (at your option) any later version. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU Affero General Public License for more details. 17 * 18 * You should have received a copy of the GNU Affero General Public License 19 * along with this program. If not, see <http://www.gnu.org/licenses/>. 20 */ 21 package de.lmu.ifi.dbs.elki.distance.distancefunction.set; 22 23 import de.lmu.ifi.dbs.elki.data.BitVector; 24 import de.lmu.ifi.dbs.elki.data.FeatureVector; 25 import de.lmu.ifi.dbs.elki.data.NumberVector; 26 import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; 27 import de.lmu.ifi.dbs.elki.database.query.DistanceSimilarityQuery; 28 import de.lmu.ifi.dbs.elki.database.query.distance.PrimitiveDistanceSimilarityQuery; 29 import de.lmu.ifi.dbs.elki.database.relation.Relation; 30 import de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunction; 31 import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; 32 import de.lmu.ifi.dbs.elki.distance.similarityfunction.NormalizedPrimitiveSimilarityFunction; 33 import de.lmu.ifi.dbs.elki.utilities.Alias; 34 import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; 35 36 /** 37 * A flexible extension of Jaccard similarity to non-binary vectors. 38 * <p> 39 * Jaccard coefficient is commonly defined as \(\frac{A\cap B}{A\cup B}\). 40 * <p> 41 * We can extend this definition to non-binary vectors as follows: 42 * \(\tfrac{|\{i\mid a_i = b_i\}|}{|\{i\mid a_i = 0 \wedge b_i = 0\}|}\) 43 * <p> 44 * For binary vectors, this will obviously be the same quantity. However, this 45 * version is more useful for categorical data. 46 * <p> 47 * Reference: 48 * <p> 49 * P. Jaccard<br> 50 * Distribution de la florine alpine dans la Bassin de Dranses et dans quelques 51 * regiones voisines<br> 52 * Bulletin del la Société Vaudoise des Sciences Naturelles 53 * 54 * @author Erich Schubert 55 * @since 0.6.0 56 */ 57 @Reference(authors = "P. Jaccard", // 58 title = "Distribution de la florine alpine dans la Bassin de Dranses et dans quelques regiones voisines", // 59 booktitle = "Bulletin del la Société Vaudoise des Sciences Naturelles", // 60 url = "http://data.rero.ch/01-R241574160", // 61 bibkey = "journals/misc/Jaccard1902") 62 @Alias("de.lmu.ifi.dbs.elki.distance.similarityfunction.JaccardPrimitiveSimilarityFunction") 63 public class JaccardSimilarityDistanceFunction extends AbstractSetDistanceFunction<FeatureVector<?>> implements NormalizedPrimitiveSimilarityFunction<FeatureVector<?>>, NumberVectorDistanceFunction<FeatureVector<?>>, PrimitiveDistanceFunction<FeatureVector<?>> { 64 /** 65 * Constructor. No parameters. 66 */ JaccardSimilarityDistanceFunction()67 public JaccardSimilarityDistanceFunction() { 68 super(); 69 } 70 71 @Override similarity(FeatureVector<?> o1, FeatureVector<?> o2)72 public double similarity(FeatureVector<?> o1, FeatureVector<?> o2) { 73 if(o1 instanceof BitVector && o2 instanceof BitVector) { 74 return ((BitVector) o1).jaccardSimilarity((BitVector) o2); 75 } 76 if(o1 instanceof NumberVector && o2 instanceof NumberVector) { 77 return similarityNumberVector((NumberVector) o1, (NumberVector) o2); 78 } 79 final int d1 = o1.getDimensionality(), d2 = o2.getDimensionality(); 80 int intersection = 0, union = 0; 81 int d = 0; 82 for(; d < d1 && d < d2; d++) { 83 Object v1 = o1.getValue(d), v2 = o2.getValue(d); 84 final boolean n1 = isNull(v1), n2 = isNull(v2); 85 if(v1 instanceof Double && Double.isNaN((Double) v1) // 86 || v2 instanceof Double && Double.isNaN((Double) v2)) { 87 continue; 88 } 89 if(!n1 || !n2) { 90 ++union; 91 if(!n1 && v1.equals(v2)) { 92 ++intersection; 93 } 94 } 95 } 96 for(; d < d1; d++) { 97 Object v1 = o1.getValue(d); 98 if(!isNull(v1)) { 99 if(v1 instanceof Double && Double.isNaN((Double) v1)) { 100 continue; 101 } 102 ++union; 103 } 104 } 105 for(; d < d2; d++) { 106 Object v2 = o2.getValue(d); 107 if(!isNull(v2)) { 108 if(v2 instanceof Double && Double.isNaN((Double) v2)) { 109 continue; 110 } 111 ++union; 112 } 113 } 114 return intersection / (double) union; 115 } 116 117 /** 118 * Compute Jaccard similarity for two number vectors. 119 * 120 * @param o1 First vector 121 * @param o2 Second vector 122 * @return Jaccard similarity 123 */ similarityNumberVector(NumberVector o1, NumberVector o2)124 public static double similarityNumberVector(NumberVector o1, NumberVector o2) { 125 final int d1 = o1.getDimensionality(), d2 = o2.getDimensionality(); 126 int intersection = 0, union = 0; 127 int d = 0; 128 for(; d < d1 && d < d2; d++) { 129 double v1 = o1.doubleValue(d), v2 = o2.doubleValue(d); 130 if(v1 != v1 || v2 != v2) { // Skip NaNs. 131 continue; 132 } 133 if(v1 != 0. || v2 != 0) { 134 ++union; 135 if(v1 == v2) { 136 ++intersection; 137 } 138 } 139 } 140 for(; d < d1; d++) { 141 if(o1.doubleValue(d) != 0) { 142 ++union; 143 } 144 } 145 for(; d < d2; d++) { 146 if(o2.doubleValue(d) != 0) { 147 ++union; 148 } 149 } 150 return intersection / (double) union; 151 } 152 153 @Override distance(FeatureVector<?> o1, FeatureVector<?> o2)154 public double distance(FeatureVector<?> o1, FeatureVector<?> o2) { 155 return 1. - similarity(o1, o2); 156 } 157 158 @Override distance(NumberVector o1, NumberVector o2)159 public double distance(NumberVector o1, NumberVector o2) { 160 if(o1 instanceof BitVector && o2 instanceof BitVector) { 161 return 1. - ((BitVector) o1).jaccardSimilarity((BitVector) o2); 162 } 163 return 1. - similarityNumberVector(o1, o2); 164 } 165 166 @Override isSymmetric()167 public boolean isSymmetric() { 168 return true; 169 } 170 171 @Override isMetric()172 public boolean isMetric() { 173 return true; 174 } 175 176 @Override getInputTypeRestriction()177 public SimpleTypeInformation<? super FeatureVector<?>> getInputTypeRestriction() { 178 return FeatureVector.TYPE; 179 } 180 181 @Override instantiate(Relation<T> relation)182 public <T extends FeatureVector<?>> DistanceSimilarityQuery<T> instantiate(Relation<T> relation) { 183 return new PrimitiveDistanceSimilarityQuery<>(relation, this, this); 184 } 185 186 @Override equals(Object obj)187 public boolean equals(Object obj) { 188 return obj == this || (obj != null && this.getClass().equals(obj.getClass())); 189 } 190 191 @Override hashCode()192 public int hashCode() { 193 return getClass().hashCode(); 194 } 195 } 196