1 /* 2 * This file is part of ELKI: 3 * Environment for Developing KDD-Applications Supported by Index-Structures 4 * 5 * Copyright (C) 2018 6 * ELKI Development Team 7 * 8 * This program is free software: you can redistribute it and/or modify 9 * it under the terms of the GNU Affero General Public License as published by 10 * the Free Software Foundation, either version 3 of the License, or 11 * (at your option) any later version. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU Affero General Public License for more details. 17 * 18 * You should have received a copy of the GNU Affero General Public License 19 * along with this program. If not, see <http://www.gnu.org/licenses/>. 20 */ 21 package de.lmu.ifi.dbs.elki.data; 22 23 import java.io.IOException; 24 import java.nio.ByteBuffer; 25 import java.util.Arrays; 26 27 import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayAdapter; 28 import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter; 29 import de.lmu.ifi.dbs.elki.utilities.io.ByteArrayUtil; 30 import de.lmu.ifi.dbs.elki.utilities.io.ByteBufferSerializer; 31 import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; 32 33 import it.unimi.dsi.fastutil.ints.Int2DoubleMap; 34 import it.unimi.dsi.fastutil.ints.Int2DoubleOpenHashMap; 35 import it.unimi.dsi.fastutil.objects.ObjectIterator; 36 37 /** 38 * Sparse vector type, using {@code short[]} for storing the values, and 39 * {@code int[]} for storing the indexes, approximately 6 bytes per non-zero 40 * value. 41 * 42 * @author Arthur Zimek 43 * @since 0.6.0 44 */ 45 public class SparseShortVector implements SparseNumberVector { 46 /** 47 * Static instance. 48 */ 49 public static final SparseShortVector.Factory FACTORY = new SparseShortVector.Factory(); 50 51 /** 52 * Serializer using varint encoding. 53 */ 54 public static final ByteBufferSerializer<SparseShortVector> VARIABLE_SERIALIZER = new VariableSerializer(); 55 56 /** 57 * Constant value, for use in (inefficient) {@link #getValue} API. 58 */ 59 private static final short SHORT0 = 0; 60 61 /** 62 * Indexes of values. 63 */ 64 private final int[] indexes; 65 66 /** 67 * Stored values. 68 */ 69 private final short[] values; 70 71 /** 72 * The dimensionality of this feature vector. 73 */ 74 private int dimensionality; 75 76 /** 77 * Direct constructor. 78 * 79 * @param indexes Indexes Must be sorted! 80 * @param values Associated value. 81 * @param dimensionality "true" dimensionality 82 */ SparseShortVector(int[] indexes, short[] values, int dimensionality)83 public SparseShortVector(int[] indexes, short[] values, int dimensionality) { 84 super(); 85 this.indexes = indexes; 86 this.values = values; 87 this.dimensionality = dimensionality; 88 } 89 90 /** 91 * Create a SparseShortVector consisting of double values according to the 92 * specified mapping of indices and values. 93 * 94 * @param values the values to be set as values of the real vector 95 * @param dimensionality the dimensionality of this feature vector 96 * @throws IllegalArgumentException if the given dimensionality is too small 97 * to cover the given values (i.e., the maximum index of any value not 98 * zero is bigger than the given dimensionality) 99 */ SparseShortVector(Int2DoubleOpenHashMap values, int dimensionality)100 public SparseShortVector(Int2DoubleOpenHashMap values, int dimensionality) throws IllegalArgumentException { 101 if(values.size() > dimensionality) { 102 throw new IllegalArgumentException("values.size() > dimensionality!"); 103 } 104 105 this.indexes = new int[values.size()]; 106 this.values = new short[values.size()]; 107 // Import and sort the indexes 108 { 109 ObjectIterator<Int2DoubleMap.Entry> iter = values.int2DoubleEntrySet().fastIterator(); 110 for(int i = 0; iter.hasNext(); i++) { 111 this.indexes[i] = iter.next().getIntKey(); 112 } 113 Arrays.sort(this.indexes); 114 } 115 // Import the values accordingly 116 { 117 for(int i = 0; i < values.size(); i++) { 118 this.values[i] = (short) values.get(this.indexes[i]); 119 } 120 } 121 this.dimensionality = dimensionality; 122 final int maxdim = getMaxDim(); 123 if(maxdim > dimensionality) { 124 throw new IllegalArgumentException("Given dimensionality " + dimensionality + " is too small w.r.t. the given values (occurring maximum: " + maxdim + ")."); 125 } 126 } 127 128 /** 129 * Get the maximum dimensionality. 130 * 131 * @return the maximum dimensionality seen 132 */ getMaxDim()133 private int getMaxDim() { 134 return (this.indexes.length == 0) ? 0 : this.indexes[this.indexes.length - 1]; 135 } 136 137 /** 138 * Create a SparseShortVector consisting of double values according to the 139 * specified mapping of indices and values. 140 * 141 * @param values the values to be set as values of the real vector 142 * @throws IllegalArgumentException if the given dimensionality is too small 143 * to cover the given values (i.e., the maximum index of any value not 144 * zero is bigger than the given dimensionality) 145 */ SparseShortVector(short[] values)146 public SparseShortVector(short[] values) throws IllegalArgumentException { 147 this.dimensionality = values.length; 148 149 // Count the number of non-zero entries 150 int size = 0; 151 { 152 for(int i = 0; i < values.length; i++) { 153 if(values[i] != 0) { 154 size++; 155 } 156 } 157 } 158 this.indexes = new int[size]; 159 this.values = new short[size]; 160 161 // Copy the values 162 { 163 int pos = 0; 164 for(int i = 0; i < values.length; i++) { 165 short value = values[i]; 166 if(value != 0) { 167 this.indexes[pos] = i; 168 this.values[pos] = value; 169 pos++; 170 } 171 } 172 } 173 } 174 175 @Override getDimensionality()176 public int getDimensionality() { 177 return dimensionality; 178 } 179 180 /** 181 * Sets the dimensionality to the new value. 182 * 183 * 184 * @param dimensionality the new dimensionality 185 * @throws IllegalArgumentException if the given dimensionality is too small 186 * to cover the given values (i.e., the maximum index of any value not 187 * zero is bigger than the given dimensionality) 188 */ 189 @Override setDimensionality(int dimensionality)190 public void setDimensionality(int dimensionality) throws IllegalArgumentException { 191 final int maxdim = getMaxDim(); 192 if(maxdim > dimensionality) { 193 throw new IllegalArgumentException("Given dimensionality " + dimensionality + " is too small w.r.t. the given values (occurring maximum: " + maxdim + ")."); 194 } 195 this.dimensionality = dimensionality; 196 } 197 198 @Override 199 @Deprecated getValue(int dimension)200 public Short getValue(int dimension) { 201 int pos = Arrays.binarySearch(this.indexes, dimension); 202 return (pos >= 0) ? values[pos] : SHORT0; 203 } 204 205 @Override 206 @Deprecated doubleValue(int dimension)207 public double doubleValue(int dimension) { 208 int pos = Arrays.binarySearch(this.indexes, dimension); 209 return (pos >= 0) ? values[pos] : 0.; 210 } 211 212 @Override 213 @Deprecated longValue(int dimension)214 public long longValue(int dimension) { 215 int pos = Arrays.binarySearch(this.indexes, dimension); 216 return (pos >= 0) ? values[pos] : 0L; 217 } 218 219 @Override 220 @Deprecated shortValue(int dimension)221 public short shortValue(int dimension) { 222 int pos = Arrays.binarySearch(this.indexes, dimension); 223 return (pos >= 0) ? values[pos] : 0; 224 } 225 226 @Override toArray()227 public double[] toArray() { 228 double[] vals = new double[dimensionality]; 229 for(int i = 0; i < indexes.length; i++) { 230 vals[this.indexes[i]] = this.values[i]; 231 } 232 return vals; 233 } 234 235 /** 236 * Create a String representation of this SparseShortVector as suitable for 237 * {@link de.lmu.ifi.dbs.elki.datasource.parser.SparseNumberVectorLabelParser} 238 * . 239 * 240 * The returned String is a single line with entries separated by 241 * {@link NumberVector#ATTRIBUTE_SEPARATOR}. The first entry gives the 242 * number of values actually not zero. Following entries are pairs of Short 243 * and Short where the Short gives the index of the dimensionality and the 244 * Short gives the corresponding value. 245 * 246 * Example: a vector (0,1.2,1.3,0)<sup>T</sup> would result in the String<br> 247 * <code>2 2 1.2 3 1.3</code><br> 248 * 249 * @return a String representation of this SparseShortVector 250 */ 251 @Override toString()252 public String toString() { 253 StringBuilder featureLine = new StringBuilder(10 * this.indexes.length)// 254 .append(this.indexes.length); 255 for(int i = 0; i < this.indexes.length; i++) { 256 featureLine.append(ATTRIBUTE_SEPARATOR).append(this.indexes[i])// 257 .append(ATTRIBUTE_SEPARATOR).append(this.values[i]); 258 } 259 return featureLine.toString(); 260 } 261 262 @Override iterDim(int iter)263 public int iterDim(int iter) { 264 return indexes[iter]; 265 } 266 267 @Override iterValid(int iter)268 public boolean iterValid(int iter) { 269 return iter < indexes.length; 270 } 271 272 @Override iterDoubleValue(int iter)273 public double iterDoubleValue(int iter) { 274 return (double) values[iter]; 275 } 276 277 @Override iterShortValue(int iter)278 public short iterShortValue(int iter) { 279 return values[iter]; 280 } 281 282 @Override iterLongValue(int iter)283 public long iterLongValue(int iter) { 284 return (long) values[iter]; 285 } 286 287 /** 288 * Factory class. 289 * 290 * @author Erich Schubert 291 * 292 * @has - - - SparseShortVector 293 */ 294 public static class Factory implements SparseNumberVector.Factory<SparseShortVector> { 295 @Override newFeatureVector(A array, ArrayAdapter<? extends Number, A> adapter)296 public <A> SparseShortVector newFeatureVector(A array, ArrayAdapter<? extends Number, A> adapter) { 297 int dim = adapter.size(array); 298 short[] values = new short[dim]; 299 for(int i = 0; i < dim; i++) { 300 values[i] = adapter.get(array, i).shortValue(); 301 } 302 // TODO: improve efficiency 303 return new SparseShortVector(values); 304 } 305 306 @Override newNumberVector(A array, NumberArrayAdapter<?, ? super A> adapter)307 public <A> SparseShortVector newNumberVector(A array, NumberArrayAdapter<?, ? super A> adapter) { 308 int dim = adapter.size(array); 309 short[] values = new short[dim]; 310 for(int i = 0; i < dim; i++) { 311 values[i] = adapter.getShort(array, i); 312 } 313 // TODO: improve efficiency 314 return new SparseShortVector(values); 315 } 316 317 @Override newNumberVector(Int2DoubleOpenHashMap values, int maxdim)318 public SparseShortVector newNumberVector(Int2DoubleOpenHashMap values, int maxdim) { 319 return new SparseShortVector(values, maxdim); 320 } 321 322 @Override getDefaultSerializer()323 public ByteBufferSerializer<SparseShortVector> getDefaultSerializer() { 324 return VARIABLE_SERIALIZER; 325 } 326 327 @Override getRestrictionClass()328 public Class<? super SparseShortVector> getRestrictionClass() { 329 return SparseShortVector.class; 330 } 331 332 /** 333 * Parameterization class. 334 * 335 * @author Erich Schubert 336 */ 337 public static class Parameterizer extends AbstractParameterizer { 338 @Override makeInstance()339 protected SparseShortVector.Factory makeInstance() { 340 return FACTORY; 341 } 342 } 343 } 344 345 /** 346 * Serialization class using VarInt encodings. 347 * 348 * @author Erich Schubert 349 * 350 * @assoc - serializes - SparseShortVector 351 */ 352 public static class VariableSerializer implements ByteBufferSerializer<SparseShortVector> { 353 @Override fromByteBuffer(ByteBuffer buffer)354 public SparseShortVector fromByteBuffer(ByteBuffer buffer) throws IOException { 355 final int dimensionality = ByteArrayUtil.readUnsignedVarint(buffer); 356 final int nonzero = ByteArrayUtil.readUnsignedVarint(buffer); 357 final int[] dims = new int[nonzero]; 358 final short[] values = new short[nonzero]; 359 for(int i = 0; i < nonzero; i++) { 360 dims[i] = ByteArrayUtil.readUnsignedVarint(buffer); 361 values[i] = (short) ByteArrayUtil.readSignedVarint(buffer); 362 } 363 return new SparseShortVector(dims, values, dimensionality); 364 } 365 366 @Override toByteBuffer(ByteBuffer buffer, SparseShortVector vec)367 public void toByteBuffer(ByteBuffer buffer, SparseShortVector vec) throws IOException { 368 ByteArrayUtil.writeUnsignedVarint(buffer, vec.dimensionality); 369 ByteArrayUtil.writeUnsignedVarint(buffer, vec.values.length); 370 for(int i = 0; i < vec.values.length; i++) { 371 ByteArrayUtil.writeUnsignedVarint(buffer, vec.indexes[i]); 372 ByteArrayUtil.writeSignedVarint(buffer, vec.values[i]); 373 } 374 } 375 376 @Override getByteSize(SparseShortVector vec)377 public int getByteSize(SparseShortVector vec) { 378 int sum = 0; 379 sum += ByteArrayUtil.getUnsignedVarintSize(vec.dimensionality); 380 sum += ByteArrayUtil.getUnsignedVarintSize(vec.values.length); 381 for(int i = 0; i < vec.values.length; i++) { 382 sum += ByteArrayUtil.getUnsignedVarintSize(vec.indexes[i]); 383 sum += ByteArrayUtil.getSignedVarintSize(vec.values[i]); 384 } 385 return sum; 386 } 387 } 388 } 389