1 /*
2 * KeyListOpsMethods.cpp
3 *
4 * Created on: Feb 6, 2014
5 * Author: nek3d
6 */
7
8 #include "KeyListOpsMethods.h"
9 #include <cmath>
10 #include <algorithm>
11 #include <limits.h>
12 #include "ParseTools.h" //to get the isNumeric function
13
KeyListOpsMethods()14 KeyListOpsMethods::KeyListOpsMethods()
15 : _keyList(&_nullKeyList),
16 _column(1),
17 _nullVal("."),
18 _delimStr(","),
19 _iter(_nullKeyList.begin()),
20 _nonNumErrFlag(false),
21 _isBam(false)
22 {
23 }
24
KeyListOpsMethods(RecordKeyVector * keyList,int column)25 KeyListOpsMethods::KeyListOpsMethods(RecordKeyVector *keyList, int column)
26 : _keyList(keyList),
27 _column(column),
28 _nullVal("."),
29 _delimStr(","),
30 _iter(keyList->begin())
31 {
32 }
33
34
~KeyListOpsMethods()35 KeyListOpsMethods::~KeyListOpsMethods() {
36
37 }
38
39 // return the total of the values in the vector
getSum()40 double KeyListOpsMethods::getSum() {
41 if (empty()) return NAN;
42
43 double theSum = 0.0;
44 for (begin(); !end(); next()) {
45 theSum += getColValNum();
46 }
47 return theSum;
48 }
49
50 // return the average value in the vector
getMean()51 double KeyListOpsMethods::getMean() {
52 if (empty()) return NAN;
53
54 return getSum() / (float)getCount();
55 }
56
57
58 // return the standard deviation
getStddev()59 double KeyListOpsMethods::getStddev() {
60 if (empty()) return NAN;
61
62 double avg = getMean();
63 double squareDiffSum = 0.0;
64 for (begin(); !end(); next()) {
65 double val = getColValNum();
66 double diff = val - avg;
67 squareDiffSum += diff * diff;
68 }
69 return sqrt(squareDiffSum / (float)getCount());
70 }
71 // return the standard deviation
getSampleStddev()72 double KeyListOpsMethods::getSampleStddev() {
73 if (empty()) return NAN;
74
75 double avg = getMean();
76 double squareDiffSum = 0.0;
77 for (begin(); !end(); next()) {
78 double val = getColValNum();
79 double diff = val - avg;
80 squareDiffSum += diff * diff;
81 }
82 return sqrt(squareDiffSum / ((float)getCount() - 1.0));
83 }
84
85 // return the median value in the vector
getMedian()86 double KeyListOpsMethods::getMedian() {
87 if (empty()) return NAN;
88
89 //get sorted vector. if even number of elems, return middle val.
90 //if odd, average of two.
91 toArray(true, ASC);
92 size_t count = getCount();
93 if (count % 2) {
94 //odd number of elements. Take middle one.
95 return _numArray[count/2];
96 } else {
97 //even numnber of elements. Take average of middle 2.
98 double sum = _numArray[count/2 -1] + _numArray[count/2];
99 return sum / 2.0;
100 }
101 }
102
103 // return the most common value in the vector
getMode()104 const string &KeyListOpsMethods::getMode() {
105 if (empty()) return _nullVal;
106
107 makeFreqMap();
108
109 //now pass through the freq map and keep track of which key has the highest occurance.
110 freqMapType::iterator maxIter = _freqMap.begin();
111 int maxVal = 0;
112 for (; _freqIter != _freqMap.end(); _freqIter++) {
113 if (_freqIter->second > maxVal) {
114 maxIter = _freqIter;
115 maxVal = _freqIter->second;
116 }
117 }
118 _retStr = maxIter->first;
119 return _retStr;
120 }
121 // return the least common value in the vector
getAntiMode()122 const string &KeyListOpsMethods::getAntiMode() {
123 if (empty()) return _nullVal;
124
125 makeFreqMap();
126
127 //now pass through the freq map and keep track of which key has the highest occurance.
128 freqMapType::iterator minIter = _freqMap.begin();
129 int minVal = INT_MAX;
130 for (; _freqIter != _freqMap.end(); _freqIter++) {
131 if (_freqIter->second < minVal) {
132 minIter = _freqIter;
133 minVal = _freqIter->second;
134 }
135 }
136 _retStr = minIter->first;
137 return _retStr;
138 }
139 // return the minimum element of the vector
getMin()140 double KeyListOpsMethods::getMin() {
141 if (empty()) return NAN;
142
143 begin();
144 double minVal = getColValNum();
145 for (; !end(); next()) {
146 double currVal = getColValNum();
147 minVal = (currVal < minVal) ? currVal : minVal;
148 }
149 return minVal;
150 }
151
152 // return the maximum element of the vector
getMax()153 double KeyListOpsMethods::getMax() {
154 if (empty()) return NAN;
155
156 begin();
157 double maxVal = getColValNum();
158 for (; !end(); next()) {
159 double currVal = getColValNum();
160 maxVal = (currVal > maxVal) ? currVal : maxVal;
161 }
162 return maxVal;
163 }
164
165 // return the minimum absolute value of the vector
getAbsMin()166 double KeyListOpsMethods::getAbsMin() {
167 if (empty()) return NAN;
168
169 begin();
170 double minVal = abs(getColValNum());
171 for (; !end(); next()) {
172 double currVal = abs(getColValNum());
173 minVal = (currVal < minVal) ? currVal : minVal;
174 }
175 return minVal;
176 }
177 // return the maximum absolute value of the vector
getAbsMax()178 double KeyListOpsMethods::getAbsMax() {
179 if (empty()) return NAN;
180
181 begin();
182 double maxVal = abs(getColValNum());
183 for (; !end(); next()) {
184 double currVal = abs(getColValNum());
185 maxVal = (currVal > maxVal) ? currVal : maxVal;
186 }
187 return maxVal;
188 }
189 // return the count of element in the vector
getCount()190 uint32_t KeyListOpsMethods::getCount() {
191 return _keyList->size();
192 }
193 // return a delimited list of the unique elements
getDistinct()194 const string &KeyListOpsMethods::getDistinct() {
195 if (empty()) return _nullVal;
196 // separated list of unique values. If something repeats, only report once.
197 makeFreqMap();
198 _retStr.clear();
199 for (; _freqIter != _freqMap.end(); _freqIter++) {
200 if (_freqIter != _freqMap.begin()) _retStr += _delimStr;
201 _retStr.append(_freqIter->first);
202 }
203 return _retStr;
204 }
205
getDistinctOnly()206 const string &KeyListOpsMethods::getDistinctOnly() {
207 if (empty()) return _nullVal;
208 // separated list of unique values. If something repeats, don't report.
209 makeFreqMap();
210 _retStr.clear();
211 for (; _freqIter != _freqMap.end(); _freqIter++) {
212 if (_freqIter->second > 1) continue;
213 if (_freqIter != _freqMap.begin()) _retStr += _delimStr;
214 _retStr.append(_freqIter->first);
215 }
216 return _retStr;
217 }
218
getDistinctSortNum(bool asc)219 const string &KeyListOpsMethods::getDistinctSortNum(bool asc) {
220 if (empty()) return _nullVal;
221
222 toArray(true, asc ? ASC : DESC);
223 vector<double>::iterator endIter = std::unique(_numArray.begin(), _numArray.end());
224
225 _retStr.clear();
226 ostringstream s;
227 for (vector<double>::iterator iter = _numArray.begin(); iter != endIter; iter++) {
228 if (iter != _numArray.begin()) s << _delimStr;
229 s << *iter;
230 }
231 _retStr.append(s.str());
232 return _retStr;
233
234 }
235
236
237 // return a the count of _unique_ elements in the vector
getCountDistinct()238 uint32_t KeyListOpsMethods::getCountDistinct() {
239 if (empty()) return 0;
240
241 makeFreqMap();
242 return _freqMap.size();
243 }
244
245 // return a delimiter-separated list of elements
getCollapse(const string & delimiter)246 const string &KeyListOpsMethods::getCollapse(const string &delimiter) {
247 if (empty()) return _nullVal;
248
249 //just put all items in one big separated list.
250 _retStr.clear();
251 int i=0;
252 for (begin(); !end(); next()) {
253 if (i > 0) _retStr += _delimStr;
254 _retStr.append(getColVal());
255 i++;
256 }
257 return _retStr;
258 }
259
260 // return a concatenation of all elements in the vector
getConcat()261 const string &KeyListOpsMethods::getConcat() {
262 if (empty()) return _nullVal;
263
264 //like collapse but w/o commas. Just a true concat of all vals.
265 //just swap out the delimChar with '' and call collapse, then
266 //restore the delimChar.
267 string oldDelimStr(_delimStr);
268 _delimStr = "";
269 getCollapse(); //this will store it's results in the _retStr method.
270 _delimStr = oldDelimStr;
271 return _retStr;
272 }
273
274 // return a histogram of values and their freqs. in desc. order of frequency
getFreqDesc()275 const string &KeyListOpsMethods::getFreqDesc() {
276 if (empty()) return _nullVal;
277
278 //for each uniq val, report # occurances, in desc order.
279 makeFreqMap();
280 //put freq map into multimap where key is the freq and val is the item. In other words, basically a reverse freq map.
281 histDescType hist;
282 for (; _freqIter != _freqMap.end(); _freqIter++) {
283 hist.insert(pair<int, string>(_freqIter->second, _freqIter->first));
284 }
285 //now iterate through the reverse map we just made and output it's pairs in val:key format.
286 _retStr.clear();
287 ostringstream s;
288 for (histDescType::iterator histIter = hist.begin(); histIter != hist.end(); histIter++) {
289 if (histIter != hist.begin()) s << _delimStr;
290 s << histIter->second;
291 s << ":";
292 s << histIter->first;
293 }
294 _retStr.append(s.str());
295 return _retStr;
296 }
297 // return a histogram of values and their freqs. in asc. order of frequency
getFreqAsc()298 const string &KeyListOpsMethods::getFreqAsc() {
299 if (empty()) return _nullVal;
300
301 //for each uniq val, report # occurances, in asc order.
302 makeFreqMap();
303 //put freq map into multimap where key is the freq and val is the item. In other words, basically a reverse freq map.
304 histAscType hist;
305 for (; _freqIter != _freqMap.end(); _freqIter++) {
306 hist.insert(pair<int, string>(_freqIter->second, _freqIter->first));
307 // hist[*(_freqIter->second)] = _freqIter->first;
308 }
309 //now iterate through the reverse map we just made and output it's pairs in val:key format.
310 _retStr.clear();
311 ostringstream s;
312 for (histAscType::iterator histIter = hist.begin(); histIter != hist.end(); histIter++) {
313 if (histIter != hist.begin()) s << _delimStr;
314 s << histIter->second;
315 s << ":";
316 s << histIter->first;
317 }
318 _retStr.append(s.str());
319 return _retStr;
320 }
321 // return the first value in the list
getFirst()322 const string &KeyListOpsMethods::getFirst() {
323 if (empty()) return _nullVal;
324
325 //just the first item.
326 begin();
327 return getColVal();
328 }
329 // return the last value in the list
getLast()330 const string &KeyListOpsMethods::getLast() {
331 if (empty()) return _nullVal;
332
333 //just the last item.
334 begin();
335 for (size_t i = 0; i < getCount() -1; i++) {
336 next();
337 }
338 return getColVal();
339 }
340
getColVal()341 const string &KeyListOpsMethods::getColVal() {
342 const string &retVal = (*_iter)->getField(_column);
343 if (_isBam && retVal.empty()) return _nullVal;
344 return retVal;
345 }
346
getColValNum()347 double KeyListOpsMethods::getColValNum() {
348 const string &strVal = (*_iter)->getField(_column);
349 if (!isNumeric(strVal)) {
350 _nonNumErrFlag = true;
351 ostringstream s;
352 _errMsg = " ***** WARNING: Non numeric value ";
353 s << strVal;
354 s << " in ";
355 s << _column;
356 s << ".";
357 _errMsg.append(s.str());
358 return NAN;
359 }
360 return atof(strVal.c_str());
361 }
362
toArray(bool useNum,SORT_TYPE sortVal)363 void KeyListOpsMethods::toArray(bool useNum, SORT_TYPE sortVal) {
364
365 //TBD: optimize performance with better memory management.
366 if (useNum) {
367 _numArray.resize(_keyList->size());
368 int i=0;
369 for (begin(); !end(); next()) {
370 _numArray[i] = getColValNum();
371 i++;
372 }
373 } else {
374 _qsArray.resize(_keyList->size());
375 int i=0;
376 for (begin(); !end(); next()) {
377 _qsArray[i] = getColVal();
378 i++;
379 }
380 }
381 if (sortVal != UNSORTED) {
382 sortArray(useNum, sortVal == ASC);
383 }
384 }
385
sortArray(bool useNum,bool ascOrder)386 void KeyListOpsMethods::sortArray(bool useNum, bool ascOrder)
387 {
388 if (useNum) {
389 if (ascOrder) {
390 sort(_numArray.begin(), _numArray.end(), less<double>());
391 } else {
392 sort(_numArray.begin(), _numArray.end(), greater<double>());
393 }
394 } else {
395 if (ascOrder) {
396 sort(_qsArray.begin(), _qsArray.end(), less<string>());
397 } else {
398 sort(_qsArray.begin(), _qsArray.end(), greater<string>());
399 }
400 }
401 }
402
makeFreqMap()403 void KeyListOpsMethods::makeFreqMap() {
404 _freqMap.clear();
405
406 //make a map of values to their number of times occuring.
407 for (begin(); !end(); next()) {
408 _freqMap[getColVal()]++;
409 }
410 _freqIter = _freqMap.begin();
411 }
412