1 /* 2 * This file is part of ELKI: 3 * Environment for Developing KDD-Applications Supported by Index-Structures 4 * 5 * Copyright (C) 2018 6 * ELKI Development Team 7 * 8 * This program is free software: you can redistribute it and/or modify 9 * it under the terms of the GNU Affero General Public License as published by 10 * the Free Software Foundation, either version 3 of the License, or 11 * (at your option) any later version. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU Affero General Public License for more details. 17 * 18 * You should have received a copy of the GNU Affero General Public License 19 * along with this program. If not, see <http://www.gnu.org/licenses/>. 20 */ 21 package de.lmu.ifi.dbs.elki.data; 22 23 import java.io.IOException; 24 import java.nio.ByteBuffer; 25 import java.util.Arrays; 26 27 import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayAdapter; 28 import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter; 29 import de.lmu.ifi.dbs.elki.utilities.io.ByteArrayUtil; 30 import de.lmu.ifi.dbs.elki.utilities.io.ByteBufferSerializer; 31 import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; 32 33 import it.unimi.dsi.fastutil.ints.Int2DoubleMap; 34 import it.unimi.dsi.fastutil.ints.Int2DoubleOpenHashMap; 35 import it.unimi.dsi.fastutil.ints.Int2FloatMap; 36 import it.unimi.dsi.fastutil.ints.Int2FloatOpenHashMap; 37 import it.unimi.dsi.fastutil.objects.ObjectIterator; 38 39 /** 40 * Sparse vector type, using {@code float[]} for storing the values, and 41 * {@code int[]} for storing the indexes, approximately 8 bytes per non-zero 42 * value. 43 * 44 * @author Arthur Zimek 45 * @since 0.2 46 */ 47 public class SparseFloatVector implements SparseNumberVector { 48 /** 49 * Static instance. 50 */ 51 public static final SparseFloatVector.Factory FACTORY = new SparseFloatVector.Factory(); 52 53 /** 54 * Serializer using varint encoding. 55 */ 56 public static final ByteBufferSerializer<SparseFloatVector> VARIABLE_SERIALIZER = new VariableSerializer(); 57 58 /** 59 * Float constant, for missing values in (inefficient) {@link #getValue} API. 60 */ 61 private static final float FLOAT0 = 0.f; 62 63 /** 64 * Indexes of values. 65 */ 66 private final int[] indexes; 67 68 /** 69 * Stored values. 70 */ 71 private final float[] values; 72 73 /** 74 * The dimensionality of this feature vector. 75 */ 76 private int dimensionality; 77 78 /** 79 * Direct constructor. 80 * 81 * @param indexes Indexes Must be sorted! 82 * @param values Associated value. 83 * @param dimensionality "true" dimensionality 84 */ SparseFloatVector(int[] indexes, float[] values, int dimensionality)85 public SparseFloatVector(int[] indexes, float[] values, int dimensionality) { 86 super(); 87 this.indexes = indexes; 88 this.values = values; 89 this.dimensionality = dimensionality; 90 } 91 92 /** 93 * Create a SparseFloatVector consisting of double values according to the 94 * specified mapping of indices and values. 95 * 96 * @param values the values to be set as values of the real vector 97 * @param dimensionality the dimensionality of this feature vector 98 * @throws IllegalArgumentException if the given dimensionality is too small 99 * to cover the given values (i.e., the maximum index of any value not 100 * zero is bigger than the given dimensionality) 101 */ SparseFloatVector(Int2FloatOpenHashMap values, int dimensionality)102 public SparseFloatVector(Int2FloatOpenHashMap values, int dimensionality) throws IllegalArgumentException { 103 if(values.size() > dimensionality) { 104 throw new IllegalArgumentException("values.size() > dimensionality!"); 105 } 106 107 this.indexes = new int[values.size()]; 108 this.values = new float[values.size()]; 109 // Import and sort the indexes 110 { 111 ObjectIterator<Int2FloatMap.Entry> iter = values.int2FloatEntrySet().fastIterator(); 112 for(int i = 0; iter.hasNext(); i++) { 113 this.indexes[i] = iter.next().getIntKey(); 114 } 115 Arrays.sort(this.indexes); 116 } 117 // Import the values accordingly 118 { 119 for(int i = 0; i < values.size(); i++) { 120 this.values[i] = values.get(this.indexes[i]); 121 } 122 } 123 this.dimensionality = dimensionality; 124 final int maxdim = getMaxDim(); 125 if(maxdim > dimensionality) { 126 throw new IllegalArgumentException("Given dimensionality " + dimensionality + " is too small w.r.t. the given values (occurring maximum: " + maxdim + ")."); 127 } 128 } 129 130 /** 131 * Get the maximum dimensionality. 132 * 133 * @return the maximum dimensionality seen 134 */ getMaxDim()135 private int getMaxDim() { 136 return (this.indexes.length == 0) ? 0 : this.indexes[this.indexes.length - 1]; 137 } 138 139 /** 140 * Create a SparseFloatVector consisting of double values according to the 141 * specified mapping of indices and values. 142 * 143 * @param values the values to be set as values of the real vector 144 * @throws IllegalArgumentException if the given dimensionality is too small 145 * to cover the given values (i.e., the maximum index of any value not 146 * zero is bigger than the given dimensionality) 147 */ SparseFloatVector(float[] values)148 public SparseFloatVector(float[] values) throws IllegalArgumentException { 149 this.dimensionality = values.length; 150 151 // Count the number of non-zero entries 152 int size = 0; 153 { 154 for(int i = 0; i < values.length; i++) { 155 if(values[i] != 0.0f) { 156 size++; 157 } 158 } 159 } 160 this.indexes = new int[size]; 161 this.values = new float[size]; 162 163 // Copy the values 164 { 165 int pos = 0; 166 for(int i = 0; i < values.length; i++) { 167 float value = values[i]; 168 if(value != 0.0f) { 169 this.indexes[pos] = i; 170 this.values[pos] = value; 171 pos++; 172 } 173 } 174 } 175 } 176 177 @Override getDimensionality()178 public int getDimensionality() { 179 return dimensionality; 180 } 181 182 /** 183 * Sets the dimensionality to the new value. 184 * 185 * 186 * @param dimensionality the new dimensionality 187 * @throws IllegalArgumentException if the given dimensionality is too small 188 * to cover the given values (i.e., the maximum index of any value not 189 * zero is bigger than the given dimensionality) 190 */ 191 @Override setDimensionality(int dimensionality)192 public void setDimensionality(int dimensionality) throws IllegalArgumentException { 193 final int maxdim = getMaxDim(); 194 if(maxdim > dimensionality) { 195 throw new IllegalArgumentException("Given dimensionality " + dimensionality + " is too small w.r.t. the given values (occurring maximum: " + maxdim + ")."); 196 } 197 this.dimensionality = dimensionality; 198 } 199 200 @Override 201 @Deprecated getValue(int dimension)202 public Float getValue(int dimension) { 203 int pos = Arrays.binarySearch(this.indexes, dimension); 204 return (pos >= 0) ? values[pos] : FLOAT0; 205 } 206 207 @Override 208 @Deprecated doubleValue(int dimension)209 public double doubleValue(int dimension) { 210 int pos = Arrays.binarySearch(this.indexes, dimension); 211 return (pos >= 0) ? values[pos] : 0.; 212 } 213 214 @Override 215 @Deprecated floatValue(int dimension)216 public float floatValue(int dimension) { 217 int pos = Arrays.binarySearch(this.indexes, dimension); 218 return (pos >= 0) ? values[pos] : 0.f; 219 } 220 221 @Override 222 @Deprecated longValue(int dimension)223 public long longValue(int dimension) { 224 int pos = Arrays.binarySearch(this.indexes, dimension); 225 return (pos >= 0) ? (long) values[pos] : 0L; 226 } 227 228 @Override toArray()229 public double[] toArray() { 230 double[] vals = new double[dimensionality]; 231 for(int i = 0; i < indexes.length; i++) { 232 vals[this.indexes[i]] = this.values[i]; 233 } 234 return vals; 235 } 236 237 /** 238 * Create a String representation of this SparseFloatVector as suitable for 239 * {@link de.lmu.ifi.dbs.elki.datasource.parser.SparseNumberVectorLabelParser} 240 * . 241 * 242 * The returned String is a single line with entries separated by 243 * {@link NumberVector#ATTRIBUTE_SEPARATOR}. The first entry gives the 244 * number of values actually not zero. Following entries are pairs of Integer 245 * and Float where the Integer gives the index of the dimensionality and the 246 * Float gives the corresponding value. 247 * 248 * Example: a vector (0,1.2,1.3,0)<sup>T</sup> would result in the String<br> 249 * <code>2 2 1.2 3 1.3</code><br> 250 * 251 * @return a String representation of this SparseFloatVector 252 */ 253 @Override toString()254 public String toString() { 255 StringBuilder featureLine = new StringBuilder(15 * this.indexes.length)// 256 .append(this.indexes.length); 257 for(int i = 0; i < this.indexes.length; i++) { 258 featureLine.append(ATTRIBUTE_SEPARATOR).append(this.indexes[i])// 259 .append(ATTRIBUTE_SEPARATOR).append(this.values[i]); 260 } 261 return featureLine.toString(); 262 } 263 264 @Override iterDim(int iter)265 public int iterDim(int iter) { 266 return indexes[iter]; 267 } 268 269 @Override iterValid(int iter)270 public boolean iterValid(int iter) { 271 return iter < indexes.length; 272 } 273 274 @Override iterDoubleValue(int iter)275 public double iterDoubleValue(int iter) { 276 return (double) values[iter]; 277 } 278 279 @Override iterFloatValue(int iter)280 public float iterFloatValue(int iter) { 281 return values[iter]; 282 } 283 284 @Override iterLongValue(int iter)285 public long iterLongValue(int iter) { 286 return (long) values[iter]; 287 } 288 289 /** 290 * Factory class. 291 * 292 * @author Erich Schubert 293 * 294 * @has - - - SparseFloatVector 295 */ 296 public static class Factory implements SparseNumberVector.Factory<SparseFloatVector> { 297 @Override newFeatureVector(A array, ArrayAdapter<? extends Number, A> adapter)298 public <A> SparseFloatVector newFeatureVector(A array, ArrayAdapter<? extends Number, A> adapter) { 299 int dim = adapter.size(array); 300 float[] values = new float[dim]; 301 for(int i = 0; i < dim; i++) { 302 values[i] = adapter.get(array, i).floatValue(); 303 } 304 // TODO: inefficient 305 return new SparseFloatVector(values); 306 } 307 308 @Override newNumberVector(A array, NumberArrayAdapter<?, ? super A> adapter)309 public <A> SparseFloatVector newNumberVector(A array, NumberArrayAdapter<?, ? super A> adapter) { 310 int dim = adapter.size(array); 311 float[] values = new float[dim]; 312 for(int i = 0; i < dim; i++) { 313 values[i] = adapter.getFloat(array, i); 314 } 315 // TODO: inefficient 316 return new SparseFloatVector(values); 317 } 318 319 @Override newNumberVector(Int2DoubleOpenHashMap dvalues, int maxdim)320 public SparseFloatVector newNumberVector(Int2DoubleOpenHashMap dvalues, int maxdim) { 321 int[] indexes = new int[dvalues.size()]; 322 float[] values = new float[dvalues.size()]; 323 // Import and sort the indexes 324 ObjectIterator<Int2DoubleMap.Entry> iter = dvalues.int2DoubleEntrySet().fastIterator(); 325 for(int i = 0; iter.hasNext(); i++) { 326 indexes[i] = iter.next().getIntKey(); 327 } 328 Arrays.sort(indexes); 329 // Import the values accordingly 330 for(int i = 0; i < dvalues.size(); i++) { 331 values[i] = (float) dvalues.get(indexes[i]); 332 } 333 return new SparseFloatVector(indexes, values, maxdim); 334 } 335 336 @Override getDefaultSerializer()337 public ByteBufferSerializer<SparseFloatVector> getDefaultSerializer() { 338 return VARIABLE_SERIALIZER; 339 } 340 341 @Override getRestrictionClass()342 public Class<? super SparseFloatVector> getRestrictionClass() { 343 return SparseFloatVector.class; 344 } 345 346 /** 347 * Parameterization class. 348 * 349 * @author Erich Schubert 350 */ 351 public static class Parameterizer extends AbstractParameterizer { 352 @Override makeInstance()353 protected SparseFloatVector.Factory makeInstance() { 354 return FACTORY; 355 } 356 } 357 } 358 359 /** 360 * Serialization class using VarInt encodings. 361 * 362 * @author Erich Schubert 363 * 364 * @assoc - serializes - SparseFloatVector 365 */ 366 public static class VariableSerializer implements ByteBufferSerializer<SparseFloatVector> { 367 @Override fromByteBuffer(ByteBuffer buffer)368 public SparseFloatVector fromByteBuffer(ByteBuffer buffer) throws IOException { 369 final int dimensionality = ByteArrayUtil.readUnsignedVarint(buffer); 370 final int nonzero = ByteArrayUtil.readUnsignedVarint(buffer); 371 final int[] dims = new int[nonzero]; 372 final float[] values = new float[nonzero]; 373 for(int i = 0; i < nonzero; i++) { 374 dims[i] = ByteArrayUtil.readUnsignedVarint(buffer); 375 values[i] = buffer.getFloat(); 376 } 377 return new SparseFloatVector(dims, values, dimensionality); 378 } 379 380 @Override toByteBuffer(ByteBuffer buffer, SparseFloatVector vec)381 public void toByteBuffer(ByteBuffer buffer, SparseFloatVector vec) throws IOException { 382 ByteArrayUtil.writeUnsignedVarint(buffer, vec.dimensionality); 383 ByteArrayUtil.writeUnsignedVarint(buffer, vec.values.length); 384 for(int i = 0; i < vec.values.length; i++) { 385 ByteArrayUtil.writeUnsignedVarint(buffer, vec.indexes[i]); 386 buffer.putFloat(vec.values[i]); 387 } 388 } 389 390 @Override getByteSize(SparseFloatVector vec)391 public int getByteSize(SparseFloatVector vec) { 392 int sum = 0; 393 sum += ByteArrayUtil.getUnsignedVarintSize(vec.dimensionality); 394 sum += ByteArrayUtil.getUnsignedVarintSize(vec.values.length); 395 for(int d : vec.indexes) { 396 sum += ByteArrayUtil.getUnsignedVarintSize(d); 397 } 398 sum += vec.values.length * ByteArrayUtil.SIZE_FLOAT; 399 return sum; 400 } 401 } 402 } 403