1 /*
2  * KeyListOpsMethods.cpp
3  *
4  *  Created on: Feb 6, 2014
5  *      Author: nek3d
6  */
7 
8 #include "KeyListOpsMethods.h"
9 #include <cmath>
10 #include <algorithm>
11 #include <limits.h>
12 #include "ParseTools.h" //to get the isNumeric function
13 
KeyListOpsMethods()14 KeyListOpsMethods::KeyListOpsMethods()
15 : _keyList(&_nullKeyList),
16   _column(1),
17   _nullVal("."),
18   _delimStr(","),
19   _iter(_nullKeyList.begin()),
20   _nonNumErrFlag(false),
21   _isBam(false)
22 {
23 }
24 
KeyListOpsMethods(RecordKeyVector * keyList,int column)25 KeyListOpsMethods::KeyListOpsMethods(RecordKeyVector *keyList, int column)
26 : _keyList(keyList),
27   _column(column),
28   _nullVal("."),
29   _delimStr(","),
30   _iter(keyList->begin())
31 {
32 }
33 
34 
~KeyListOpsMethods()35 KeyListOpsMethods::~KeyListOpsMethods() {
36 
37 }
38 
39 // return the total of the values in the vector
getSum()40 double KeyListOpsMethods::getSum() {
41 	if (empty()) return NAN;
42 
43 	double theSum = 0.0;
44 	for (begin(); !end(); next()) {
45 		theSum += getColValNum();
46 	}
47 	return theSum;
48 }
49 
50 // return the average value in the vector
getMean()51 double KeyListOpsMethods::getMean() {
52 	if (empty()) return NAN;
53 
54 	return getSum() / (float)getCount();
55 }
56 
57 
58  // return the standard deviation
getStddev()59 double KeyListOpsMethods::getStddev() {
60 	if (empty()) return NAN;
61 
62 	double avg = getMean();
63 	double squareDiffSum = 0.0;
64 	for (begin(); !end(); next()) {
65 		double val = getColValNum();
66 		double diff = val - avg;
67 		squareDiffSum += diff * diff;
68 	}
69 	return sqrt(squareDiffSum / (float)getCount());
70 }
71 // return the standard deviation
getSampleStddev()72 double KeyListOpsMethods::getSampleStddev() {
73 	if (empty()) return NAN;
74 
75 	double avg = getMean();
76 	double squareDiffSum = 0.0;
77 	for (begin(); !end(); next()) {
78 		double val = getColValNum();
79 		double diff = val - avg;
80 		squareDiffSum += diff * diff;
81 	}
82 	return sqrt(squareDiffSum / ((float)getCount() - 1.0));
83 }
84 
85 // return the median value in the vector
getMedian()86 double KeyListOpsMethods::getMedian() {
87 	if (empty()) return NAN;
88 
89 	//get sorted vector. if even number of elems, return middle val.
90 	//if odd, average of two.
91 	toArray(true, ASC);
92 	size_t count = getCount();
93 	if (count % 2) {
94 		//odd number of elements. Take middle one.
95 		return _numArray[count/2];
96 	} else {
97 		//even numnber of elements. Take average of middle 2.
98 		double sum = _numArray[count/2 -1] + _numArray[count/2];
99 		return sum / 2.0;
100 	}
101 }
102 
103 // return the most common value in the vector
getMode()104 const string &KeyListOpsMethods::getMode() {
105 	if (empty()) return _nullVal;
106 
107 	makeFreqMap();
108 
109 	//now pass through the freq map and keep track of which key has the highest occurance.
110 	freqMapType::iterator maxIter = _freqMap.begin();
111 	int maxVal = 0;
112 	for (; _freqIter != _freqMap.end(); _freqIter++) {
113 		if (_freqIter->second > maxVal) {
114 			maxIter = _freqIter;
115 			maxVal = _freqIter->second;
116 		}
117 	}
118 	_retStr = maxIter->first;
119 	return _retStr;
120 }
121 // return the least common value in the vector
getAntiMode()122 const string &KeyListOpsMethods::getAntiMode() {
123 	if (empty()) return _nullVal;
124 
125 	makeFreqMap();
126 
127 	//now pass through the freq map and keep track of which key has the highest occurance.
128 	freqMapType::iterator minIter = _freqMap.begin();
129 	int minVal = INT_MAX;
130 	for (; _freqIter != _freqMap.end(); _freqIter++) {
131 		if (_freqIter->second < minVal) {
132 			minIter = _freqIter;
133 			minVal = _freqIter->second;
134 		}
135 	}
136 	_retStr =  minIter->first;
137 	return _retStr;
138 }
139 // return the minimum element of the vector
getMin()140 double KeyListOpsMethods::getMin() {
141 	if (empty()) return NAN;
142 
143 	begin();
144 	double minVal = getColValNum();
145 	for (; !end(); next()) {
146 		double currVal = getColValNum();
147 		minVal = (currVal < minVal) ? currVal : minVal;
148 	}
149 	return  minVal;
150 }
151 
152 // return the maximum element of the vector
getMax()153 double KeyListOpsMethods::getMax() {
154 	if (empty()) return NAN;
155 
156 	begin();
157 	double maxVal = getColValNum();
158 	for (; !end(); next()) {
159 		double currVal = getColValNum();
160 		maxVal = (currVal > maxVal) ? currVal : maxVal;
161 	}
162 	return maxVal;
163 }
164 
165 // return the minimum absolute value of the vector
getAbsMin()166 double KeyListOpsMethods::getAbsMin() {
167 	if (empty()) return NAN;
168 
169 	begin();
170 	double minVal = abs(getColValNum());
171 	for (; !end(); next()) {
172 		double currVal = abs(getColValNum());
173 		minVal = (currVal < minVal) ? currVal : minVal;
174 	}
175 	return minVal;
176 }
177 // return the maximum absolute value of the vector
getAbsMax()178 double KeyListOpsMethods::getAbsMax() {
179 	if (empty()) return NAN;
180 
181 	begin();
182 	double maxVal = abs(getColValNum());
183 	for (; !end(); next()) {
184 		double currVal = abs(getColValNum());
185 		maxVal = (currVal > maxVal) ? currVal : maxVal;
186 	}
187 	return maxVal;
188 }
189 // return the count of element in the vector
getCount()190 uint32_t KeyListOpsMethods::getCount() {
191 	return _keyList->size();
192 }
193 // return a delimited list of the unique elements
getDistinct()194 const string &KeyListOpsMethods::getDistinct() {
195 	if (empty()) return _nullVal;
196 	// separated list of unique values. If something repeats, only report once.
197 	makeFreqMap();
198 	_retStr.clear();
199 	for (; _freqIter != _freqMap.end(); _freqIter++) {
200 		if (_freqIter != _freqMap.begin()) _retStr += _delimStr;
201 		_retStr.append(_freqIter->first);
202 	}
203 	return _retStr;
204 }
205 
getDistinctOnly()206 const string &KeyListOpsMethods::getDistinctOnly() {
207 	if (empty()) return _nullVal;
208 	// separated list of unique values. If something repeats, don't report.
209 	makeFreqMap();
210 	_retStr.clear();
211 	for (; _freqIter != _freqMap.end(); _freqIter++) {
212 		if (_freqIter->second > 1) continue;
213 		if (_freqIter != _freqMap.begin()) _retStr += _delimStr;
214 		_retStr.append(_freqIter->first);
215 	}
216 	return _retStr;
217 }
218 
getDistinctSortNum(bool asc)219 const string &KeyListOpsMethods::getDistinctSortNum(bool asc) {
220 	if (empty()) return _nullVal;
221 
222 	toArray(true, asc ? ASC : DESC);
223 	vector<double>::iterator endIter = std::unique(_numArray.begin(), _numArray.end());
224 
225 	_retStr.clear();
226 	ostringstream s;
227 	for (vector<double>::iterator iter = _numArray.begin(); iter != endIter; iter++) {
228 		if (iter != _numArray.begin()) s << _delimStr;
229 		s << *iter;
230 	}
231 	_retStr.append(s.str());
232 	return  _retStr;
233 
234 }
235 
236 
237 // return a the count of _unique_ elements in the vector
getCountDistinct()238 uint32_t KeyListOpsMethods::getCountDistinct() {
239 	if (empty()) return 0;
240 
241 	makeFreqMap();
242 	return _freqMap.size();
243 }
244 
245 // return a delimiter-separated list of elements
getCollapse(const string & delimiter)246 const string &KeyListOpsMethods::getCollapse(const string &delimiter) {
247 	if (empty()) return _nullVal;
248 
249 	//just put all items in one big separated list.
250 	_retStr.clear();
251 	int i=0;
252 	for (begin(); !end(); next()) {
253 		if (i > 0) _retStr += _delimStr;
254 		_retStr.append(getColVal());
255 		i++;
256 	}
257 	return _retStr;
258 }
259 
260 // return a concatenation of all elements in the vector
getConcat()261 const string &KeyListOpsMethods::getConcat() {
262 	if (empty()) return _nullVal;
263 
264 	//like collapse but w/o commas. Just a true concat of all vals.
265 	//just swap out the delimChar with '' and call collapse, then
266 	//restore the delimChar.
267 	string oldDelimStr(_delimStr);
268 	_delimStr = "";
269 	getCollapse(); //this will store it's results in the _retStr method.
270 	_delimStr = oldDelimStr;
271 	return _retStr;
272 }
273 
274 // return a histogram of values and their freqs. in desc. order of frequency
getFreqDesc()275 const string &KeyListOpsMethods::getFreqDesc() {
276 	if (empty()) return _nullVal;
277 
278 	//for each uniq val, report # occurances, in desc order.
279 	makeFreqMap();
280 	//put freq map into multimap where key is the freq and val is the item. In other words, basically a reverse freq map.
281 	histDescType hist;
282 	for (; _freqIter != _freqMap.end(); _freqIter++) {
283 		hist.insert(pair<int, string>(_freqIter->second, _freqIter->first));
284 	}
285 	//now iterate through the reverse map we just made and output it's pairs in val:key format.
286 	_retStr.clear();
287 	ostringstream s;
288 	for (histDescType::iterator histIter = hist.begin(); histIter != hist.end(); histIter++) {
289 		if (histIter != hist.begin()) s << _delimStr;
290 		s << histIter->second;
291 		s << ":";
292 		s << histIter->first;
293 	}
294 	_retStr.append(s.str());
295 	return _retStr;
296 }
297 // return a histogram of values and their freqs. in asc. order of frequency
getFreqAsc()298 const string &KeyListOpsMethods::getFreqAsc() {
299 	if (empty()) return _nullVal;
300 
301 	//for each uniq val, report # occurances, in asc order.
302 	makeFreqMap();
303 	//put freq map into multimap where key is the freq and val is the item. In other words, basically a reverse freq map.
304 	histAscType hist;
305 	for (; _freqIter != _freqMap.end(); _freqIter++) {
306 		hist.insert(pair<int, string>(_freqIter->second, _freqIter->first));
307 //		hist[*(_freqIter->second)] = _freqIter->first;
308 	}
309 	//now iterate through the reverse map we just made and output it's pairs in val:key format.
310 	_retStr.clear();
311 	ostringstream s;
312 	for (histAscType::iterator histIter = hist.begin(); histIter != hist.end(); histIter++) {
313 		if (histIter != hist.begin()) s << _delimStr;
314 		s << histIter->second;
315 		s << ":";
316 		s << histIter->first;
317 	}
318 	_retStr.append(s.str());
319 	return _retStr;
320 }
321 // return the first value in the list
getFirst()322 const string &KeyListOpsMethods::getFirst() {
323 	if (empty()) return _nullVal;
324 
325 	//just the first item.
326 	begin();
327 	return getColVal();
328 }
329 // return the last value in the list
getLast()330 const string &KeyListOpsMethods::getLast() {
331 	if (empty()) return _nullVal;
332 
333 	//just the last item.
334 	begin();
335 	for (size_t i = 0; i < getCount() -1; i++) {
336 		next();
337 	}
338 	return getColVal();
339 }
340 
getColVal()341 const string &KeyListOpsMethods::getColVal() {
342 	const string &retVal = (*_iter)->getField(_column);
343 	if (_isBam && retVal.empty()) return _nullVal;
344 	return retVal;
345 }
346 
getColValNum()347 double KeyListOpsMethods::getColValNum() {
348 	const string &strVal = (*_iter)->getField(_column);
349 	if (!isNumeric(strVal)) {
350 		_nonNumErrFlag = true;
351 		ostringstream s;
352 		_errMsg = " ***** WARNING: Non numeric value ";
353 		s << strVal;
354 		s << " in ";
355 		s << _column;
356 		s << ".";
357 		_errMsg.append(s.str());
358 		return NAN;
359 	}
360 	return atof(strVal.c_str());
361 }
362 
toArray(bool useNum,SORT_TYPE sortVal)363 void KeyListOpsMethods::toArray(bool useNum, SORT_TYPE sortVal) {
364 
365 	//TBD: optimize performance with better memory management.
366 	if (useNum) {
367 		_numArray.resize(_keyList->size());
368 		int i=0;
369 		for (begin(); !end(); next()) {
370 			_numArray[i] = getColValNum();
371 			i++;
372 		}
373 	} else {
374 		_qsArray.resize(_keyList->size());
375 		int i=0;
376 		for (begin(); !end(); next()) {
377 			_qsArray[i] = getColVal();
378 			i++;
379 		}
380 	}
381 	if (sortVal != UNSORTED) {
382 		sortArray(useNum, sortVal == ASC);
383 	}
384 }
385 
sortArray(bool useNum,bool ascOrder)386 void KeyListOpsMethods::sortArray(bool useNum, bool ascOrder)
387 {
388 	if (useNum) {
389 		if (ascOrder) {
390 			sort(_numArray.begin(), _numArray.end(), less<double>());
391 		} else {
392 			sort(_numArray.begin(), _numArray.end(), greater<double>());
393 		}
394 	} else {
395 		if (ascOrder) {
396 			sort(_qsArray.begin(), _qsArray.end(), less<string>());
397 		} else {
398 			sort(_qsArray.begin(), _qsArray.end(), greater<string>());
399 		}
400 	}
401 }
402 
makeFreqMap()403 void KeyListOpsMethods::makeFreqMap() {
404 	_freqMap.clear();
405 
406 	//make a map of values to their number of times occuring.
407 	for (begin(); !end(); next()) {
408 		_freqMap[getColVal()]++;
409 	}
410 	_freqIter = _freqMap.begin();
411 }
412