1 /*
2  * This file is part of ELKI:
3  * Environment for Developing KDD-Applications Supported by Index-Structures
4  *
5  * Copyright (C) 2018
6  * ELKI Development Team
7  *
8  * This program is free software: you can redistribute it and/or modify
9  * it under the terms of the GNU Affero General Public License as published by
10  * the Free Software Foundation, either version 3 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU Affero General Public License for more details.
17  *
18  * You should have received a copy of the GNU Affero General Public License
19  * along with this program. If not, see <http://www.gnu.org/licenses/>.
20  */
21 package de.lmu.ifi.dbs.elki.datasource.parser;
22 
23 import java.util.ArrayList;
24 
25 import de.lmu.ifi.dbs.elki.data.LabelList;
26 import de.lmu.ifi.dbs.elki.data.SparseFloatVector;
27 import de.lmu.ifi.dbs.elki.data.SparseNumberVector;
28 import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation;
29 import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
30 import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation;
31 import de.lmu.ifi.dbs.elki.logging.Logging;
32 import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
33 import de.lmu.ifi.dbs.elki.utilities.io.ParseUtil;
34 import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
35 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
36 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag;
37 import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
38 
39 import it.unimi.dsi.fastutil.ints.Int2DoubleMap;
40 import it.unimi.dsi.fastutil.ints.Int2DoubleOpenHashMap;
41 import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
42 import it.unimi.dsi.fastutil.objects.ObjectIterator;
43 
44 /**
45  * A parser to load term frequency data, which essentially are sparse vectors
46  * with text keys.
47  * <p>
48  * Parse a file containing term frequencies. The expected format is:
49  *
50  * <pre>
51  * rowlabel1 term1 &lt;freq&gt; term2 &lt;freq&gt; ...
52  * rowlabel2 term1 &lt;freq&gt; term3 &lt;freq&gt; ...
53  * </pre>
54  *
55  * Terms must not contain the separator character!
56  * <p>
57  * If your data does not contain frequencies, you can maybe use
58  * {@link SimpleTransactionParser} instead.
59  *
60  * @author Erich Schubert
61  * @since 0.4.0
62  *
63  * @has - - - SparseNumberVector
64  */
65 public class TermFrequencyParser<V extends SparseNumberVector> extends NumberVectorLabelParser<V> {
66   /**
67    * Class logger.
68    */
69   private static final Logging LOG = Logging.getLogger(TermFrequencyParser.class);
70 
71   /**
72    * Number of different terms observed.
73    */
74   int numterms;
75 
76   /**
77    * Map.
78    */
79   Object2IntOpenHashMap<String> keymap;
80 
81   /**
82    * Normalize.
83    */
84   boolean normalize;
85 
86   /**
87    * Same as {@link #factory}, but subtype.
88    */
89   private SparseNumberVector.Factory<V> sparsefactory;
90 
91   /**
92    * (Reused) set of values for the number vector.
93    */
94   Int2DoubleOpenHashMap values = new Int2DoubleOpenHashMap();
95 
96   /**
97    * (Reused) label buffer.
98    */
99   ArrayList<String> labels = new ArrayList<>();
100 
101   /**
102    * Constructor.
103    *
104    * @param normalize Normalize
105    * @param factory Vector type
106    */
TermFrequencyParser(boolean normalize, SparseNumberVector.Factory<V> factory)107   public TermFrequencyParser(boolean normalize, SparseNumberVector.Factory<V> factory) {
108     this(normalize, CSVReaderFormat.DEFAULT_FORMAT, null, factory);
109   }
110 
111   /**
112    * Constructor.
113    *
114    * @param normalize Normalize
115    * @param format Input format
116    * @param labelIndices Indices to use as labels
117    * @param factory Vector type
118    */
TermFrequencyParser(boolean normalize, CSVReaderFormat format, long[] labelIndices, SparseNumberVector.Factory<V> factory)119   public TermFrequencyParser(boolean normalize, CSVReaderFormat format, long[] labelIndices, SparseNumberVector.Factory<V> factory) {
120     super(format, labelIndices, factory);
121     this.normalize = normalize;
122     this.keymap = new Object2IntOpenHashMap<>();
123     this.keymap.defaultReturnValue(-1);
124     this.sparsefactory = factory;
125   }
126 
127   @Override
parseLineInternal()128   protected boolean parseLineInternal() {
129     double len = 0;
130 
131     String curterm = null;
132     int c = 0;
133     for(/* initialized by nextLineExceptComments() */; tokenizer.valid(); tokenizer.advance()) {
134       if(isLabelColumn(c++)) {
135         labels.add(tokenizer.getSubstring());
136         continue;
137       }
138       if(curterm == null) {
139         curterm = tokenizer.getSubstring();
140         continue;
141       }
142       try {
143         double attribute = tokenizer.getDouble();
144         int curdim = keymap.getInt(curterm);
145         if(curdim < 0) {
146           curdim = numterms;
147           keymap.put(curterm, curdim);
148           ++numterms;
149         }
150         values.put(curdim, attribute);
151         len += attribute;
152         curterm = null;
153       }
154       catch(NumberFormatException e) {
155         if(!warnedPrecision && (e == ParseUtil.PRECISION_OVERFLOW || e == ParseUtil.EXPONENT_OVERFLOW)) {
156           getLogger().warning("Too many digits in what looked like a double number - treating as string: " + tokenizer.getSubstring());
157           warnedPrecision = true;
158         }
159         labels.add(curterm);
160         curterm = tokenizer.getSubstring();
161       }
162     }
163     if(curterm != null) {
164       labels.add(curterm);
165     }
166     haslabels |= !labels.isEmpty();
167     if(normalize && Math.abs(len - 1.0) > Double.MIN_NORMAL) {
168       for(ObjectIterator<Int2DoubleMap.Entry> iter = values.int2DoubleEntrySet().fastIterator(); iter.hasNext();) {
169         Int2DoubleMap.Entry entry = iter.next();
170         entry.setValue(entry.getDoubleValue() / len);
171       }
172     }
173 
174     curvec = sparsefactory.newNumberVector(values, numterms);
175     curlbl = LabelList.make(labels);
176     values.clear();
177     labels.clear();
178     return true;
179   }
180 
181   @Override
getTypeInformation(int mindim, int maxdim)182   protected SimpleTypeInformation<V> getTypeInformation(int mindim, int maxdim) {
183     if(mindim == maxdim) {
184       return new VectorFieldTypeInformation<>(factory, mindim);
185     }
186     else if(mindim < maxdim) {
187       return new VectorTypeInformation<>(factory, factory.getDefaultSerializer(), mindim, maxdim);
188     }
189     throw new AbortException("No vectors were read from the input file - cannot determine vector data type.");
190   }
191 
192   @Override
getLogger()193   protected Logging getLogger() {
194     return LOG;
195   }
196 
197   /**
198    * Parameterization class.
199    *
200    * @author Erich Schubert
201    */
202   public static class Parameterizer<V extends SparseNumberVector> extends NumberVectorLabelParser.Parameterizer<V> {
203     /**
204      * Option ID for normalization.
205      */
206     public static final OptionID NORMALIZE_FLAG = new OptionID("tf.normalize", "Normalize vectors to manhattan length 1 (convert term counts to term frequencies)");
207 
208     /**
209      * Normalization flag.
210      */
211     boolean normalize = false;
212 
213     @Override
makeOptions(Parameterization config)214     protected void makeOptions(Parameterization config) {
215       super.makeOptions(config);
216       Flag normF = new Flag(NORMALIZE_FLAG);
217       if(config.grab(normF)) {
218         normalize = normF.isTrue();
219       }
220     }
221 
222     @Override
getFactory(Parameterization config)223     protected void getFactory(Parameterization config) {
224       ObjectParameter<SparseNumberVector.Factory<V>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, SparseNumberVector.Factory.class, SparseFloatVector.Factory.class);
225       if(config.grab(factoryP)) {
226         factory = factoryP.instantiateClass(config);
227       }
228     }
229 
230     @Override
makeInstance()231     protected TermFrequencyParser<V> makeInstance() {
232       return new TermFrequencyParser<>(normalize, format, labelIndices, (SparseNumberVector.Factory<V>) factory);
233     }
234   }
235 }
236