1 /* 2 * This file is part of ELKI: 3 * Environment for Developing KDD-Applications Supported by Index-Structures 4 * 5 * Copyright (C) 2018 6 * ELKI Development Team 7 * 8 * This program is free software: you can redistribute it and/or modify 9 * it under the terms of the GNU Affero General Public License as published by 10 * the Free Software Foundation, either version 3 of the License, or 11 * (at your option) any later version. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU Affero General Public License for more details. 17 * 18 * You should have received a copy of the GNU Affero General Public License 19 * along with this program. If not, see <http://www.gnu.org/licenses/>. 20 */ 21 package de.lmu.ifi.dbs.elki.data; 22 23 import java.io.IOException; 24 import java.nio.ByteBuffer; 25 import java.util.Arrays; 26 27 import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayAdapter; 28 import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter; 29 import de.lmu.ifi.dbs.elki.utilities.io.ByteArrayUtil; 30 import de.lmu.ifi.dbs.elki.utilities.io.ByteBufferSerializer; 31 import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; 32 33 import it.unimi.dsi.fastutil.ints.Int2DoubleMap; 34 import it.unimi.dsi.fastutil.ints.Int2DoubleOpenHashMap; 35 import it.unimi.dsi.fastutil.objects.ObjectIterator; 36 37 /** 38 * Sparse vector type, using {@code byte[]} for storing the values, and 39 * {@code int[]} for storing the indexes, approximately 5 bytes per non-zero 40 * value (limited to -128..+127). 41 * 42 * @author Arthur Zimek 43 * @since 0.6.0 44 */ 45 public class SparseByteVector implements SparseNumberVector { 46 /** 47 * Static instance. 48 */ 49 public static final SparseByteVector.Factory FACTORY = new SparseByteVector.Factory(); 50 51 /** 52 * Serializer using varint encoding. 53 */ 54 public static final ByteBufferSerializer<SparseByteVector> VARIABLE_SERIALIZER = new VariableSerializer(); 55 56 /** 57 * Indexes of values. 58 */ 59 private final int[] indexes; 60 61 /** 62 * Stored values. 63 */ 64 private final byte[] values; 65 66 /** 67 * The dimensionality of this feature vector. 68 */ 69 private int dimensionality; 70 71 /** 72 * Direct constructor. 73 * 74 * @param indexes Indexes Must be sorted! 75 * @param values Associated value. 76 * @param dimensionality "true" dimensionality 77 */ SparseByteVector(int[] indexes, byte[] values, int dimensionality)78 public SparseByteVector(int[] indexes, byte[] values, int dimensionality) { 79 super(); 80 this.indexes = indexes; 81 this.values = values; 82 this.dimensionality = dimensionality; 83 } 84 85 /** 86 * Create a SparseByteVector consisting of double values according to the 87 * specified mapping of indices and values. 88 * 89 * @param values the values to be set as values of the real vector 90 * @param dimensionality the dimensionality of this feature vector 91 * @throws IllegalArgumentException if the given dimensionality is too small 92 * to cover the given values (i.e., the maximum index of any value not 93 * zero is bigger than the given dimensionality) 94 */ SparseByteVector(Int2DoubleOpenHashMap values, int dimensionality)95 public SparseByteVector(Int2DoubleOpenHashMap values, int dimensionality) throws IllegalArgumentException { 96 if(values.size() > dimensionality) { 97 throw new IllegalArgumentException("values.size() > dimensionality!"); 98 } 99 100 this.indexes = new int[values.size()]; 101 this.values = new byte[values.size()]; 102 // Import and sort the indexes 103 { 104 ObjectIterator<Int2DoubleMap.Entry> iter = values.int2DoubleEntrySet().fastIterator(); 105 for(int i = 0; iter.hasNext(); i++) { 106 this.indexes[i] = iter.next().getIntKey(); 107 } 108 Arrays.sort(this.indexes); 109 } 110 // Import the values accordingly 111 { 112 for(int i = 0; i < values.size(); i++) { 113 this.values[i] = (byte) values.get(this.indexes[i]); 114 } 115 } 116 this.dimensionality = dimensionality; 117 final int maxdim = getMaxDim(); 118 if(maxdim > dimensionality) { 119 throw new IllegalArgumentException("Given dimensionality " + dimensionality + " is too small w.r.t. the given values (occurring maximum: " + maxdim + ")."); 120 } 121 } 122 123 /** 124 * Get the maximum dimensionality. 125 * 126 * @return the maximum dimensionality seen 127 */ getMaxDim()128 private int getMaxDim() { 129 return (this.indexes.length == 0) ? 0 : this.indexes[this.indexes.length - 1]; 130 } 131 132 /** 133 * Create a SparseByteVector consisting of double values according to the 134 * specified mapping of indices and values. 135 * 136 * @param values the values to be set as values of the real vector 137 * @throws IllegalArgumentException if the given dimensionality is too small 138 * to cover the given values (i.e., the maximum index of any value not 139 * zero is bigger than the given dimensionality) 140 */ SparseByteVector(byte[] values)141 public SparseByteVector(byte[] values) throws IllegalArgumentException { 142 this.dimensionality = values.length; 143 144 // Count the number of non-zero entries 145 int size = 0; 146 { 147 for(int i = 0; i < values.length; i++) { 148 if(values[i] != 0) { 149 size++; 150 } 151 } 152 } 153 this.indexes = new int[size]; 154 this.values = new byte[size]; 155 156 // Copy the values 157 { 158 int pos = 0; 159 for(int i = 0; i < values.length; i++) { 160 byte value = values[i]; 161 if(value != 0) { 162 this.indexes[pos] = i; 163 this.values[pos] = value; 164 pos++; 165 } 166 } 167 } 168 } 169 170 @Override getDimensionality()171 public int getDimensionality() { 172 return dimensionality; 173 } 174 175 /** 176 * Sets the dimensionality to the new value. 177 * 178 * 179 * @param dimensionality the new dimensionality 180 * @throws IllegalArgumentException if the given dimensionality is too small 181 * to cover the given values (i.e., the maximum index of any value not 182 * zero is bigger than the given dimensionality) 183 */ 184 @Override setDimensionality(int dimensionality)185 public void setDimensionality(int dimensionality) throws IllegalArgumentException { 186 final int maxdim = getMaxDim(); 187 if(maxdim > dimensionality) { 188 throw new IllegalArgumentException("Given dimensionality " + dimensionality + " is too small w.r.t. the given values (occurring maximum: " + maxdim + ")."); 189 } 190 this.dimensionality = dimensionality; 191 } 192 193 /** 194 * Value to return for missing entires. 195 */ 196 private static final Byte BYTE0 = Byte.valueOf((byte) 0); 197 198 @Override 199 @Deprecated getValue(int dimension)200 public Byte getValue(int dimension) { 201 int pos = Arrays.binarySearch(this.indexes, dimension); 202 return (pos >= 0) ? values[pos] : BYTE0; 203 } 204 205 @Override 206 @Deprecated doubleValue(int dimension)207 public double doubleValue(int dimension) { 208 int pos = Arrays.binarySearch(this.indexes, dimension); 209 return (pos >= 0) ? values[pos] : 0.; 210 } 211 212 @Override 213 @Deprecated longValue(int dimension)214 public long longValue(int dimension) { 215 int pos = Arrays.binarySearch(this.indexes, dimension); 216 return (pos >= 0) ? values[pos] : 0L; 217 } 218 219 @Override byteValue(int dimension)220 public byte byteValue(int dimension) { 221 int pos = Arrays.binarySearch(this.indexes, dimension); 222 return pos >= 0 ? values[pos] : 0; 223 } 224 225 @Override toArray()226 public double[] toArray() { 227 double[] vals = new double[dimensionality]; 228 for(int i = 0; i < indexes.length; i++) { 229 vals[this.indexes[i]] = this.values[i]; 230 } 231 return vals; 232 } 233 234 /** 235 * Create a String representation of this SparseByteVector as suitable for 236 * {@link de.lmu.ifi.dbs.elki.datasource.parser.SparseNumberVectorLabelParser} 237 * . 238 * <p> 239 * The returned String is a single line with entries separated by 240 * {@link NumberVector#ATTRIBUTE_SEPARATOR}. The first entry gives the 241 * number of values actually not zero. Following entries are pairs of Byte and 242 * Byte where the Byte gives the index of the dimensionality and the Byte 243 * gives the corresponding value. 244 * <p> 245 * Example: a vector (0,1.2,1.3,0)<sup>T</sup> would result in the String<br> 246 * <code>2 2 1.2 3 1.3</code> 247 * 248 * @return a String representation of this SparseByteVector 249 */ 250 @Override toString()251 public String toString() { 252 StringBuilder featureLine = new StringBuilder(8 * this.indexes.length)// 253 .append(this.indexes.length); 254 for(int i = 0; i < this.indexes.length; i++) { 255 featureLine.append(ATTRIBUTE_SEPARATOR).append(this.indexes[i])// 256 .append(ATTRIBUTE_SEPARATOR).append(this.values[i]); 257 } 258 return featureLine.toString(); 259 } 260 261 @Override iterDim(int iter)262 public int iterDim(int iter) { 263 return indexes[iter]; 264 } 265 266 @Override iterValid(int iter)267 public boolean iterValid(int iter) { 268 return iter < indexes.length; 269 } 270 271 @Override iterDoubleValue(int iter)272 public double iterDoubleValue(int iter) { 273 return (double) values[iter]; 274 } 275 276 @Override iterLongValue(int iter)277 public long iterLongValue(int iter) { 278 return (long) values[iter]; 279 } 280 281 @Override iterByteValue(int iter)282 public byte iterByteValue(int iter) { 283 return values[iter]; 284 } 285 286 /** 287 * Factory class. 288 * 289 * @author Erich Schubert 290 * 291 * @has - - - SparseByteVector 292 */ 293 public static class Factory implements SparseNumberVector.Factory<SparseByteVector> { 294 @Override newFeatureVector(A array, ArrayAdapter<? extends Number, A> adapter)295 public <A> SparseByteVector newFeatureVector(A array, ArrayAdapter<? extends Number, A> adapter) { 296 int dim = adapter.size(array); 297 byte[] values = new byte[dim]; 298 for(int i = 0; i < dim; i++) { 299 values[i] = adapter.get(array, i).byteValue(); 300 } 301 // TODO: improve efficiency 302 return new SparseByteVector(values); 303 } 304 305 @Override newNumberVector(A array, NumberArrayAdapter<?, ? super A> adapter)306 public <A> SparseByteVector newNumberVector(A array, NumberArrayAdapter<?, ? super A> adapter) { 307 int dim = adapter.size(array); 308 byte[] values = new byte[dim]; 309 for(int i = 0; i < dim; i++) { 310 values[i] = adapter.getByte(array, i); 311 } 312 // TODO: improve efficiency 313 return new SparseByteVector(values); 314 } 315 316 @Override newNumberVector(Int2DoubleOpenHashMap values, int maxdim)317 public SparseByteVector newNumberVector(Int2DoubleOpenHashMap values, int maxdim) { 318 return new SparseByteVector(values, maxdim); 319 } 320 321 @Override getDefaultSerializer()322 public ByteBufferSerializer<SparseByteVector> getDefaultSerializer() { 323 return VARIABLE_SERIALIZER; 324 } 325 326 @Override getRestrictionClass()327 public Class<? super SparseByteVector> getRestrictionClass() { 328 return SparseByteVector.class; 329 } 330 331 /** 332 * Parameterization class. 333 * 334 * @author Erich Schubert 335 */ 336 public static class Parameterizer extends AbstractParameterizer { 337 @Override makeInstance()338 protected SparseByteVector.Factory makeInstance() { 339 return FACTORY; 340 } 341 } 342 } 343 344 /** 345 * Serialization class using VarInt encodings. 346 * 347 * @author Erich Schubert 348 * 349 * @assoc - serializes - SparseByteVector 350 */ 351 public static class VariableSerializer implements ByteBufferSerializer<SparseByteVector> { 352 @Override fromByteBuffer(ByteBuffer buffer)353 public SparseByteVector fromByteBuffer(ByteBuffer buffer) throws IOException { 354 final int dimensionality = ByteArrayUtil.readUnsignedVarint(buffer); 355 final int nonzero = ByteArrayUtil.readUnsignedVarint(buffer); 356 final int[] dims = new int[nonzero]; 357 final byte[] values = new byte[nonzero]; 358 for(int i = 0; i < nonzero; i++) { 359 dims[i] = ByteArrayUtil.readUnsignedVarint(buffer); 360 values[i] = buffer.get(); 361 } 362 return new SparseByteVector(dims, values, dimensionality); 363 } 364 365 @Override toByteBuffer(ByteBuffer buffer, SparseByteVector vec)366 public void toByteBuffer(ByteBuffer buffer, SparseByteVector vec) throws IOException { 367 ByteArrayUtil.writeUnsignedVarint(buffer, vec.dimensionality); 368 ByteArrayUtil.writeUnsignedVarint(buffer, vec.values.length); 369 for(int i = 0; i < vec.values.length; i++) { 370 ByteArrayUtil.writeUnsignedVarint(buffer, vec.indexes[i]); 371 buffer.put(vec.values[i]); 372 } 373 } 374 375 @Override getByteSize(SparseByteVector vec)376 public int getByteSize(SparseByteVector vec) { 377 int sum = 0; 378 sum += ByteArrayUtil.getUnsignedVarintSize(vec.dimensionality); 379 sum += ByteArrayUtil.getUnsignedVarintSize(vec.values.length); 380 for(int i = 0; i < vec.values.length; i++) { 381 sum += ByteArrayUtil.getUnsignedVarintSize(vec.indexes[i]); 382 ++sum; 383 } 384 return sum; 385 } 386 } 387 } 388