1 /* 2 * This file is part of ELKI: 3 * Environment for Developing KDD-Applications Supported by Index-Structures 4 * 5 * Copyright (C) 2018 6 * ELKI Development Team 7 * 8 * This program is free software: you can redistribute it and/or modify 9 * it under the terms of the GNU Affero General Public License as published by 10 * the Free Software Foundation, either version 3 of the License, or 11 * (at your option) any later version. 12 * 13 * This program is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * GNU Affero General Public License for more details. 17 * 18 * You should have received a copy of the GNU Affero General Public License 19 * along with this program. If not, see <http://www.gnu.org/licenses/>. 20 */ 21 package de.lmu.ifi.dbs.elki.datasource.parser; 22 23 import java.util.ArrayList; 24 25 import de.lmu.ifi.dbs.elki.data.LabelList; 26 import de.lmu.ifi.dbs.elki.data.SparseFloatVector; 27 import de.lmu.ifi.dbs.elki.data.SparseNumberVector; 28 import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; 29 import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation; 30 import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation; 31 import de.lmu.ifi.dbs.elki.logging.Logging; 32 import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; 33 import de.lmu.ifi.dbs.elki.utilities.io.ParseUtil; 34 import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; 35 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; 36 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; 37 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; 38 39 import it.unimi.dsi.fastutil.ints.Int2DoubleMap; 40 import it.unimi.dsi.fastutil.ints.Int2DoubleOpenHashMap; 41 import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; 42 import it.unimi.dsi.fastutil.objects.ObjectIterator; 43 44 /** 45 * A parser to load term frequency data, which essentially are sparse vectors 46 * with text keys. 47 * <p> 48 * Parse a file containing term frequencies. The expected format is: 49 * 50 * <pre> 51 * rowlabel1 term1 <freq> term2 <freq> ... 52 * rowlabel2 term1 <freq> term3 <freq> ... 53 * </pre> 54 * 55 * Terms must not contain the separator character! 56 * <p> 57 * If your data does not contain frequencies, you can maybe use 58 * {@link SimpleTransactionParser} instead. 59 * 60 * @author Erich Schubert 61 * @since 0.4.0 62 * 63 * @has - - - SparseNumberVector 64 */ 65 public class TermFrequencyParser<V extends SparseNumberVector> extends NumberVectorLabelParser<V> { 66 /** 67 * Class logger. 68 */ 69 private static final Logging LOG = Logging.getLogger(TermFrequencyParser.class); 70 71 /** 72 * Number of different terms observed. 73 */ 74 int numterms; 75 76 /** 77 * Map. 78 */ 79 Object2IntOpenHashMap<String> keymap; 80 81 /** 82 * Normalize. 83 */ 84 boolean normalize; 85 86 /** 87 * Same as {@link #factory}, but subtype. 88 */ 89 private SparseNumberVector.Factory<V> sparsefactory; 90 91 /** 92 * (Reused) set of values for the number vector. 93 */ 94 Int2DoubleOpenHashMap values = new Int2DoubleOpenHashMap(); 95 96 /** 97 * (Reused) label buffer. 98 */ 99 ArrayList<String> labels = new ArrayList<>(); 100 101 /** 102 * Constructor. 103 * 104 * @param normalize Normalize 105 * @param factory Vector type 106 */ TermFrequencyParser(boolean normalize, SparseNumberVector.Factory<V> factory)107 public TermFrequencyParser(boolean normalize, SparseNumberVector.Factory<V> factory) { 108 this(normalize, CSVReaderFormat.DEFAULT_FORMAT, null, factory); 109 } 110 111 /** 112 * Constructor. 113 * 114 * @param normalize Normalize 115 * @param format Input format 116 * @param labelIndices Indices to use as labels 117 * @param factory Vector type 118 */ TermFrequencyParser(boolean normalize, CSVReaderFormat format, long[] labelIndices, SparseNumberVector.Factory<V> factory)119 public TermFrequencyParser(boolean normalize, CSVReaderFormat format, long[] labelIndices, SparseNumberVector.Factory<V> factory) { 120 super(format, labelIndices, factory); 121 this.normalize = normalize; 122 this.keymap = new Object2IntOpenHashMap<>(); 123 this.keymap.defaultReturnValue(-1); 124 this.sparsefactory = factory; 125 } 126 127 @Override parseLineInternal()128 protected boolean parseLineInternal() { 129 double len = 0; 130 131 String curterm = null; 132 int c = 0; 133 for(/* initialized by nextLineExceptComments() */; tokenizer.valid(); tokenizer.advance()) { 134 if(isLabelColumn(c++)) { 135 labels.add(tokenizer.getSubstring()); 136 continue; 137 } 138 if(curterm == null) { 139 curterm = tokenizer.getSubstring(); 140 continue; 141 } 142 try { 143 double attribute = tokenizer.getDouble(); 144 int curdim = keymap.getInt(curterm); 145 if(curdim < 0) { 146 curdim = numterms; 147 keymap.put(curterm, curdim); 148 ++numterms; 149 } 150 values.put(curdim, attribute); 151 len += attribute; 152 curterm = null; 153 } 154 catch(NumberFormatException e) { 155 if(!warnedPrecision && (e == ParseUtil.PRECISION_OVERFLOW || e == ParseUtil.EXPONENT_OVERFLOW)) { 156 getLogger().warning("Too many digits in what looked like a double number - treating as string: " + tokenizer.getSubstring()); 157 warnedPrecision = true; 158 } 159 labels.add(curterm); 160 curterm = tokenizer.getSubstring(); 161 } 162 } 163 if(curterm != null) { 164 labels.add(curterm); 165 } 166 haslabels |= !labels.isEmpty(); 167 if(normalize && Math.abs(len - 1.0) > Double.MIN_NORMAL) { 168 for(ObjectIterator<Int2DoubleMap.Entry> iter = values.int2DoubleEntrySet().fastIterator(); iter.hasNext();) { 169 Int2DoubleMap.Entry entry = iter.next(); 170 entry.setValue(entry.getDoubleValue() / len); 171 } 172 } 173 174 curvec = sparsefactory.newNumberVector(values, numterms); 175 curlbl = LabelList.make(labels); 176 values.clear(); 177 labels.clear(); 178 return true; 179 } 180 181 @Override getTypeInformation(int mindim, int maxdim)182 protected SimpleTypeInformation<V> getTypeInformation(int mindim, int maxdim) { 183 if(mindim == maxdim) { 184 return new VectorFieldTypeInformation<>(factory, mindim); 185 } 186 else if(mindim < maxdim) { 187 return new VectorTypeInformation<>(factory, factory.getDefaultSerializer(), mindim, maxdim); 188 } 189 throw new AbortException("No vectors were read from the input file - cannot determine vector data type."); 190 } 191 192 @Override getLogger()193 protected Logging getLogger() { 194 return LOG; 195 } 196 197 /** 198 * Parameterization class. 199 * 200 * @author Erich Schubert 201 */ 202 public static class Parameterizer<V extends SparseNumberVector> extends NumberVectorLabelParser.Parameterizer<V> { 203 /** 204 * Option ID for normalization. 205 */ 206 public static final OptionID NORMALIZE_FLAG = new OptionID("tf.normalize", "Normalize vectors to manhattan length 1 (convert term counts to term frequencies)"); 207 208 /** 209 * Normalization flag. 210 */ 211 boolean normalize = false; 212 213 @Override makeOptions(Parameterization config)214 protected void makeOptions(Parameterization config) { 215 super.makeOptions(config); 216 Flag normF = new Flag(NORMALIZE_FLAG); 217 if(config.grab(normF)) { 218 normalize = normF.isTrue(); 219 } 220 } 221 222 @Override getFactory(Parameterization config)223 protected void getFactory(Parameterization config) { 224 ObjectParameter<SparseNumberVector.Factory<V>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, SparseNumberVector.Factory.class, SparseFloatVector.Factory.class); 225 if(config.grab(factoryP)) { 226 factory = factoryP.instantiateClass(config); 227 } 228 } 229 230 @Override makeInstance()231 protected TermFrequencyParser<V> makeInstance() { 232 return new TermFrequencyParser<>(normalize, format, labelIndices, (SparseNumberVector.Factory<V>) factory); 233 } 234 } 235 } 236