1 /*
2  *  ccode.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 8/24/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9 
10 #include "ccode.h"
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
13 
14 //***************************************************************************************************************
Ccode(string filename,string temp,bool f,string mask,int win,int numW,string o)15 Ccode::Ccode(string filename, string temp, bool f, string mask, int win, int numW, string o) : MothurChimera() {
16  try {
17 
18 	fastafile = filename;
19 	outputDir = o;
20 	templateFileName = temp;  templateSeqs = readSeqs(temp);
21 	setMask(mask);
22 	filter = f;
23 	window = win;
24 	numWanted = numW;
25 
26      distCalc = new eachGapDist(1.0);
27 	decalc = new DeCalculator();
28 
29      Utils util;
30 	mapInfo = outputDir + util.getRootName(util.getSimpleName(fastafile)) + "mapinfo";
31 
32      ofstream out2;
33      util.openOutputFile(mapInfo, out2);
34 
35      out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl;
36      out2.close();
37 
38 	}
39 	catch(exception& e) {
40 		m->errorOut(e, "Ccode", "Ccode");
41 		exit(1);
42 	}
43 }
44 //***************************************************************************************************************
~Ccode()45 Ccode::~Ccode() {
46 	delete distCalc;
47 	delete decalc;
48 }
49 //***************************************************************************************************************
print(ostream & out,ostream & outAcc)50 Sequence Ccode::print(ostream& out, ostream& outAcc) {
51 	try {
52 
53 		ofstream out2;
54 		Utils util; util.openOutputFileAppend(mapInfo, out2);
55 
56 		out2 << querySeq->getName() << endl;
57 		for (it = spotMap.begin(); it!= spotMap.end(); it++) {
58 			out2 << it->first << '\t' << it->second << endl;
59 		}
60 		out2.close();
61 		out << querySeq->getName() << endl << endl << "Reference sequences used and distance to query:" << endl;
62 
63 		for (int j = 0; j < closest.size(); j++) {
64 			out << closest[j].seq->getName() << '\t' << closest[j].dist << endl;
65 		}
66 		out << endl << endl;
67 
68 		//for each window
69 		//window mapping info.
70 		out << "Mapping information: ";
71 		//you mask and did not filter
72 		if ((seqMask != "") && (!filter)) { out << "mask and trim."; }
73 
74 		//you filtered and did not mask
75 		if ((seqMask == "") && (filter)) { out << "filter and trim."; }
76 
77 		//you masked and filtered
78 		if ((seqMask != "") && (filter)) { out << "mask, filter and trim."; }
79 
80 		out << endl << "Window\tStartPos\tEndPos" << endl;
81 		it = trim.begin();
82 		for (int k = 0; k < windows.size()-1; k++) {
83 			out << k+1 << '\t' << spotMap[windows[k]-it->first] << '\t' << spotMap[windows[k]-it->first+windowSizes] << endl;
84 		}
85 
86 		out << windows.size() << '\t' << spotMap[windows[windows.size()-1]-it->first] << '\t' << spotMap[it->second-it->first-1] << endl;
87 		out << endl;
88 		out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl;
89 		for (int k = 0; k < windows.size(); k++) {
90 			float ds = averageQuery[k] / averageRef[k];
91 			out << k+1 << '\t' << averageQuery[k] << '\t' << sdQuery[k] << '\t' << averageRef[k] << '\t'<< sdRef[k] << '\t' << ds << '\t' << anova[k] << endl;
92 		}
93 		out << endl;
94 
95 		//varRef
96 		//varQuery
97 		/* F test for differences among variances.
98 		* varQuery is expected to be higher or similar than varRef */
99 		//float fs = varQuery[query] / varRef[query];	/* F-Snedecor, test for differences of variances */
100 
101 		bool results = false;
102 
103 		//confidence limit, t - Student, anova
104 		out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl;
105 
106 		for (int k = 0; k < windows.size(); k++) {
107 			string temp = "";
108 			if (isChimericConfidence[k]) {  temp += "*\t"; }
109 			else { temp += "\t"; }
110 
111 			if (isChimericTStudent[k]) {  temp += "*\t"; }
112 			else { temp += "\t"; }
113 
114 			if (isChimericANOVA[k]) {  temp += "*\t"; }
115 			else { temp += "\t"; }
116 
117 			out << k+1 << '\t' << temp << endl;
118 
119 			if (temp == "*\t*\t*\t") {  results = true;  }
120 		}
121 		out << endl;
122 
123 		if (results) {
124 			m->mothurOut(querySeq->getName() + " was found have at least one chimeric window.\n");
125 			outAcc << querySeq->getName() << endl;
126 		}
127 
128 		//free memory
129 		for (int i = 0; i < closest.size(); i++) {  delete closest[i].seq;  }
130 
131 		return *querySeq;
132 	}
133 	catch(exception& e) {
134 		m->errorOut(e, "Ccode", "print");
135 		exit(1);
136 	}
137 }
138 //***************************************************************************************************************
getChimeras(Sequence * query)139 int Ccode::getChimeras(Sequence* query) {
140 	try {
141 
142 		closest.clear();
143 		refCombo = 0;
144 		sumRef.clear();
145 		varRef.clear();
146 		varQuery.clear();
147 		sdRef.clear();
148 		sdQuery.clear();
149 		sumQuery.clear();
150 		sumSquaredRef.clear();
151 		sumSquaredQuery.clear();
152 		averageRef.clear();
153 		averageQuery.clear();
154 		anova.clear();
155 		isChimericConfidence.clear();
156 		isChimericTStudent.clear();
157 		isChimericANOVA.clear();
158 		trim.clear();
159 		spotMap.clear();
160 		windowSizes = window;
161 		windows.clear();
162 
163 
164 		querySeq = query;
165 
166 		//find closest matches to query
167 		closest = findClosest(query, numWanted);
168 
169 		if (m->getControl_pressed()) {  return 0;  }
170 
171 		//initialize spotMap
172 		for (int i = 0; i < query->getAligned().length(); i++) {	spotMap[i] = i;		}
173 
174 		//mask sequences if the user wants to
175 		if (seqMask != "") {
176 			decalc->setMask(seqMask);
177 
178 			decalc->runMask(query);
179 
180 			//mask closest
181 			for (int i = 0; i < closest.size(); i++) {	decalc->runMask(closest[i].seq);	}
182 
183 			spotMap = decalc->getMaskMap();
184 		}
185 
186 		if (filter) {
187 			vector<Sequence*> temp;
188 			for (int i = 0; i < closest.size(); i++) { temp.push_back(closest[i].seq);  }
189 			temp.push_back(query);
190 
191 			createFilter(temp, 0.5);
192 
193 			for (int i = 0; i < temp.size(); i++) {
194 				if (m->getControl_pressed()) {  return 0;  }
195 				runFilter(temp[i]);
196 			}
197 
198 			//update spotMap
199 			map<int, int> newMap;
200 			int spot = 0;
201 
202 			for (int i = 0; i < filterString.length(); i++) {
203 				if (filterString[i] == '1') {
204 					//add to newMap
205 					newMap[spot] = spotMap[i];
206 					spot++;
207 				}
208 			}
209 			spotMap = newMap;
210 		}
211 
212 		//trim sequences - this follows ccodes remove_extra_gaps
213 		trimSequences(query);
214 		if (m->getControl_pressed()) {  return 0;  }
215 
216 		//windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length().
217 		//Our default will be 10% and we will warn if user tries to use a window above or below these recommendations
218 		windows = findWindows();
219 		if (m->getControl_pressed()) {  return 0;  }
220 
221 		//remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later
222 		removeBadReferenceSeqs(closest);
223 		if (m->getControl_pressed()) {  return 0;  }
224 
225 		//find the averages for each querys references
226 		getAverageRef(closest);  //fills sumRef, averageRef, sumSquaredRef and refCombo.
227 		getAverageQuery(closest, query);  //fills sumQuery, averageQuery, sumSquaredQuery.
228 		if (m->getControl_pressed()) {  return 0;  }
229 
230 		//find the averages for each querys references
231 		findVarianceRef();  //fills varRef and sdRef also sets minimum error rate to 0.001 to avoid divide by 0.
232 		if (m->getControl_pressed()) {  return 0;  }
233 
234 		//find the averages for the query
235 		findVarianceQuery();  //fills varQuery and sdQuery also sets minimum error rate to 0.001 to avoid divide by 0.
236 		if (m->getControl_pressed()) {  return 0;  }
237 
238 		determineChimeras();  //fills anova, isChimericConfidence, isChimericTStudent and isChimericANOVA.
239 		if (m->getControl_pressed()) {  return 0;  }
240 
241 		return 0;
242 	}
243 	catch(exception& e) {
244 		m->errorOut(e, "Ccode", "getChimeras");
245 		exit(1);
246 	}
247 }
248 /***************************************************************************************************************/
249 //ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences."
trimSequences(Sequence * query)250 void Ccode::trimSequences(Sequence* query) {
251 	try {
252 
253 		int frontPos = 0;  //should contain first position in all seqs that is not a gap character
254 		int rearPos = query->getAligned().length();
255 
256 		//********find first position in closest seqs that is a non gap character***********//
257 		//find first position all query seqs that is a non gap character
258 		for (int i = 0; i < closest.size(); i++) {
259 
260 			string aligned = closest[i].seq->getAligned();
261 			int pos = 0;
262 
263 			//find first spot in this seq
264 			for (int j = 0; j < aligned.length(); j++) {
265 				if (isalpha(aligned[j])) {
266 					pos = j;
267 					break;
268 				}
269 			}
270 
271 			//save this spot if it is the farthest
272 			if (pos > frontPos) { frontPos = pos; }
273 		}
274 
275 		//find first position all querySeq[query] that is a non gap character
276 		string aligned = query->getAligned();
277 		int pos = 0;
278 
279 		//find first spot in this seq
280 		for (int j = 0; j < aligned.length(); j++) {
281 			if (isalpha(aligned[j])) {
282 				pos = j;
283 				break;
284 			}
285 		}
286 
287 		//save this spot if it is the farthest
288 		if (pos > frontPos) { frontPos = pos; }
289 
290 
291 		//********find last position in closest seqs that is a non gap character***********//
292 		for (int i = 0; i < closest.size(); i++) {
293 
294 			string aligned = closest[i].seq->getAligned();
295 			int pos = aligned.length();
296 
297 			//find first spot in this seq
298 			for (int j = aligned.length()-1; j >= 0; j--) {
299 				if (isalpha(aligned[j])) {
300 					pos = j;
301 					break;
302 				}
303 			}
304 
305 			//save this spot if it is the farthest
306 			if (pos < rearPos) { rearPos = pos; }
307 		}
308 
309 		//find last position all querySeqs[query] that is a non gap character
310 		aligned = query->getAligned();
311 		pos = aligned.length();
312 
313 		//find first spot in this seq
314 		for (int j = aligned.length()-1; j >= 0; j--) {
315 			if (isalpha(aligned[j])) {
316 				pos = j;
317 				break;
318 			}
319 		}
320 
321 		//save this spot if it is the farthest
322 		if (pos < rearPos) { rearPos = pos; }
323 
324 
325 		//check to make sure that is not whole seq
326 		if ((rearPos - frontPos - 1) <= 0) {  m->mothurOut("Error, when I trim your sequences, the entire sequence is trimmed.\n");  exit(1);  }
327 
328 		map<int, int> tempTrim;
329 		tempTrim[frontPos] = rearPos;
330 
331 		//save trimmed locations
332 		trim = tempTrim;
333 
334 		//update spotMask
335 		map<int, int> newMap;
336 		int spot = 0;
337 
338 		for (int i = frontPos; i < rearPos; i++) {
339 			//add to newMap
340 			newMap[spot] = spotMap[i];
341 			spot++;
342 		}
343 		spotMap = newMap;
344 	}
345 	catch(exception& e) {
346 		m->errorOut(e, "Ccode", "trimSequences");
347 		exit(1);
348 	}
349 }
350 /***************************************************************************************************************/
findWindows()351 vector<int> Ccode::findWindows() {
352 	try {
353 
354 		vector<int> win;
355 		it = trim.begin();
356 
357 		int length = it->second - it->first;
358 
359 		//default is wanted = 10% of total length
360 		if (windowSizes > length) {
361 			m->mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length.");
362 			windowSizes = length / 10;
363 		}else if (windowSizes == 0) { windowSizes = length / 10;  }
364 		else if (windowSizes > (length * 0.20)) {
365 			m->mothurOut("You have selected a window that is larger than 20% of your sequence length.  This is not recommended, but I will continue anyway.\n");
366 		}else if (windowSizes < (length * 0.05)) {
367 			m->mothurOut("You have selected a window that is smaller than 5% of your sequence length.  This is not recommended, but I will continue anyway.\n");
368 		}
369 
370 		//save starting points of each window
371 		for (int m = it->first;  m < (it->second-windowSizes); m+=windowSizes) {  win.push_back(m);  }
372 
373 		//save last window
374 		if (win[win.size()-1] < (it->first+length)) {
375 			win.push_back(win[win.size()-1]+windowSizes); // ex. string length is 115, window is 25, without this you would get 0, 25, 50, 75
376 		}																									//with this you would get 1,25,50,75,100
377 
378 		return win;
379 	}
380 	catch(exception& e) {
381 		m->errorOut(e, "Ccode", "findWindows");
382 		exit(1);
383 	}
384 }
385 //***************************************************************************************************************
getDiff(string seqA,string seqB)386 int Ccode::getDiff(string seqA, string seqB) {
387 	try {
388 
389 		int numDiff = 0;
390 
391 		for (int i = 0; i < seqA.length(); i++) {
392 			//if you are both not gaps
393 			//if (isalpha(seqA[i]) && isalpha(seqA[i])) {
394 				//are you different
395 				if (seqA[i] != seqB[i]) {
396 					 int ok; /* ok=1 means equivalent base. Checks for degenerate bases */
397 
398 					/* the char in base_a and base_b have been checked and they are different */
399 					if ((seqA[i] == 'N') && (seqB[i] != '-')) ok = 1;
400 					else if ((seqB[i] == 'N') && (seqA[i] != '-')) ok = 1;
401 					else if ((seqA[i] == 'Y') && ((seqB[i] == 'C') || (seqB[i] == 'T'))) ok = 1;
402 					else if ((seqB[i] == 'Y') && ((seqA[i] == 'C') || (seqA[i] == 'T'))) ok = 1;
403 					else if ((seqA[i] == 'R') && ((seqB[i] == 'G') || (seqB[i] == 'A'))) ok = 1;
404 					else if ((seqB[i] == 'R') && ((seqA[i] == 'G') || (seqA[i] == 'A'))) ok = 1;
405 					else if ((seqA[i] == 'S') && ((seqB[i] == 'C') || (seqB[i] == 'G'))) ok = 1;
406 					else if ((seqB[i] == 'S') && ((seqA[i] == 'C') || (seqA[i] == 'G'))) ok = 1;
407 					else if ((seqA[i] == 'W') && ((seqB[i] == 'T') || (seqB[i] == 'A'))) ok = 1;
408 					else if ((seqB[i] == 'W') && ((seqA[i] == 'T') || (seqA[i] == 'A'))) ok = 1;
409 					else if ((seqA[i] == 'M') && ((seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
410 					else if ((seqB[i] == 'M') && ((seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
411 					else if ((seqA[i] == 'K') && ((seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
412 					else if ((seqB[i] == 'K') && ((seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
413 					else if ((seqA[i] == 'V') && ((seqB[i] == 'C') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
414 					else if ((seqB[i] == 'V') && ((seqA[i] == 'C') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
415 					else if ((seqA[i] == 'H') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
416 					else if ((seqB[i] == 'H') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
417 					else if ((seqA[i] == 'D') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
418 					else if ((seqB[i] == 'D') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
419 					else if ((seqA[i] == 'B') && ((seqB[i] == 'C') || (seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
420 					else if ((seqB[i] == 'B') && ((seqA[i] == 'C') || (seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
421 					else ok = 0;  /* the bases are different and not equivalent */
422 
423 					//check if they are both blanks
424 					if ((seqA[i] == '.') && (seqB[i] == '-')) ok = 1;
425 					else if ((seqB[i] == '.') && (seqA[i] == '-')) ok = 1;
426 
427 					if (ok == 0) {  numDiff++;  }
428 				}
429 			//}
430 		}
431 
432 		return numDiff;
433 
434 	}
435 	catch(exception& e) {
436 		m->errorOut(e, "Ccode", "getDiff");
437 		exit(1);
438 	}
439 }
440 //***************************************************************************************************************
441 //tried to make this look most like ccode original implementation
removeBadReferenceSeqs(vector<SeqDist> & seqs)442 void Ccode::removeBadReferenceSeqs(vector<SeqDist>& seqs) {
443 	try {
444 
445 		vector< vector<int> > numDiffBases;
446 		numDiffBases.resize(seqs.size());
447 		//initialize to 0
448 		for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); }
449 
450 		it = trim.begin();
451 		int length = it->second - it->first;
452 
453 		//calc differences from each sequence to everyother seq in the set
454 		for (int i = 0; i < seqs.size(); i++) {
455 
456 			string seqA = seqs[i].seq->getAligned().substr(it->first, length);
457 
458 			//so you don't calc i to j and j to i since they are the same
459 			for (int j = 0; j < i; j++) {
460 
461 				string seqB = seqs[j].seq->getAligned().substr(it->first, length);
462 
463 				//compare strings
464 				int numDiff = getDiff(seqA, seqB);
465 
466 				numDiffBases[i][j] = numDiff;
467 				numDiffBases[j][i] = numDiff;
468 			}
469 		}
470 
471 		//initailize remove to 0
472 		vector<int> remove;  remove.resize(seqs.size(), 0);
473 		float top = ((20*length) / (float) 100);
474 		float bottom = ((0.5*length) / (float) 100);
475 
476 		//check each numDiffBases and if any are higher than threshold set remove to 1 so you can remove those seqs from the closest set
477 		for (int i = 0; i < numDiffBases.size(); i++) {
478 			for (int j = 0; j < i; j++) {
479 				//are you more than 20% different
480 				if (numDiffBases[i][j] > top)		{  remove[j] = 1;  }
481 				//are you less than 0.5% different
482 				if (numDiffBases[i][j] < bottom)	{  remove[j] = 1;  }
483 			}
484 		}
485 
486 		int numSeqsLeft = 0;
487 
488 		//count seqs that are not going to be removed
489 		for (int i = 0; i < remove.size(); i++) {
490 			if (remove[i] == 0)  { numSeqsLeft++;  }
491 		}
492 
493 		//if you have enough then remove bad ones
494 		if (numSeqsLeft >= 3) {
495 			vector<SeqDist> goodSeqs;
496 			//remove bad seqs
497 			for (int i = 0; i < remove.size(); i++) {
498 				if (remove[i] == 0) {
499 					goodSeqs.push_back(seqs[i]);
500 				}
501 			}
502 
503 			seqs = goodSeqs;
504 
505 		}else { //warn, but dont remove any
506 			m->mothurOut(querySeq->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity.  I will continue, but please check.\n");
507 		}
508 
509 	}
510 	catch(exception& e) {
511 		m->errorOut(e, "Ccode", "removeBadReferenceSeqs");
512 		exit(1);
513 	}
514 }
515 //***************************************************************************************************************
516 //makes copy of templateseq for filter
findClosest(Sequence * q,int numWanted)517 vector<SeqDist>  Ccode::findClosest(Sequence* q, int numWanted) {
518 	try{
519 
520 		vector<SeqDist>  topMatches;
521 
522 		Sequence query = *(q);
523 
524 		//calc distance to each sequence in template seqs
525 		for (int i = 0; i < templateSeqs.size(); i++) {
526 
527 			Sequence ref = *(templateSeqs[i]);
528 
529 			//find overall dist
530 			double dist = distCalc->calcDist(query, ref);
531 
532 			//save distance
533 			SeqDist temp;
534 			temp.seq = new Sequence(templateSeqs[i]->getName(), templateSeqs[i]->getAligned());
535 			temp.dist = dist;
536 
537 			topMatches.push_back(temp);
538 		}
539 
540 		sort(topMatches.begin(), topMatches.end(), compareSeqDist);
541 
542 		for (int i = numWanted; i < topMatches.size(); i++) {  delete topMatches[i].seq;  }
543 
544 		topMatches.resize(numWanted);
545 
546 		return topMatches;
547 
548 	}
549 	catch(exception& e) {
550 		m->errorOut(e, "Ccode", "findClosestSides");
551 		exit(1);
552 	}
553 }
554 /**************************************************************************************************/
555 //find the distances from each reference sequence to every other reference sequence for each window for this query
getAverageRef(vector<SeqDist> ref)556 void Ccode::getAverageRef(vector<SeqDist> ref) {
557 	try {
558 
559 		vector< vector< vector<int> > >  diffs;  //diffs[0][1][2] is the number of differences between ref seq 0 and ref seq 1 at window 2.
560 
561 		//initialize diffs vector
562 		diffs.resize(ref.size());
563 		for (int i = 0; i < diffs.size(); i++) {
564 			diffs[i].resize(ref.size());
565 			for (int j = 0; j < diffs[i].size(); j++) {
566 				diffs[i][j].resize(windows.size(), 0);
567 			}
568 		}
569 
570 		it = trim.begin();
571 
572 		//find the distances from each reference sequence to every other reference sequence for each window for this query
573 		for (int i = 0; i < ref.size(); i++) {
574 
575 			string refI = ref[i].seq->getAligned();
576 
577 			//j<i, so you don't find distances from i to j and then j to i.
578 			for (int j = 0; j < i; j++) {
579 
580 				string refJ = ref[j].seq->getAligned();
581 
582 				for (int k = 0; k < windows.size(); k++) {
583 
584 					string refIWindowk, refJWindowk;
585 
586 					if (k < windows.size()-1) {
587 						//get window strings
588 						refIWindowk = refI.substr(windows[k], windowSizes);
589 						refJWindowk = refJ.substr(windows[k], windowSizes);
590 					}else { //last window may be smaller than rest - see findwindows
591 						//get window strings
592 						refIWindowk = refI.substr(windows[k], (it->second-windows[k]));
593 						refJWindowk = refJ.substr(windows[k], (it->second-windows[k]));
594 					}
595 
596 					//find differences
597 					int diff = getDiff(refIWindowk, refJWindowk);
598 
599 					//save differences in [i][j][k] and [j][i][k] since they are the same
600 					diffs[i][j][k] = diff;
601 					diffs[j][i][k] = diff;
602 
603 				}//k
604 
605 			}//j
606 
607 		}//i
608 
609 		//initialize sumRef for this query
610 		sumRef.resize(windows.size(), 0);
611 		sumSquaredRef.resize(windows.size(), 0);
612 		averageRef.resize(windows.size(), 0);
613 
614 		//find the sum of the differences for hte reference sequences
615 		for (int i = 0; i < diffs.size(); i++) {
616 			for (int j = 0; j < i; j++) {
617 
618 				//increment this querys reference sequences combos
619 				refCombo++;
620 
621 				for (int k = 0; k < diffs[i][j].size(); k++) {
622 					sumRef[k] += diffs[i][j][k];
623 					sumSquaredRef[k] += (diffs[i][j][k]*diffs[i][j][k]);
624 				}//k
625 
626 			}//j
627 		}//i
628 
629 
630 		//find the average of the differences for the references for each window
631 		for (int i = 0; i < windows.size(); i++) {
632 			averageRef[i] = sumRef[i] / (float) refCombo;
633 		}
634 
635 	}
636 	catch(exception& e) {
637 		m->errorOut(e, "Ccode", "getAverageRef");
638 		exit(1);
639 	}
640 }
641 /**************************************************************************************************/
getAverageQuery(vector<SeqDist> ref,Sequence * query)642 void Ccode::getAverageQuery (vector<SeqDist> ref, Sequence* query) {
643 	try {
644 
645 		vector< vector<int> >  diffs;  //diffs[1][2] is the number of differences between querySeqs[query] and ref seq 1 at window 2.
646 
647 		//initialize diffs vector
648 		diffs.resize(ref.size());
649 		for (int j = 0; j < diffs.size(); j++) {
650 			diffs[j].resize(windows.size(), 0);
651 		}
652 
653 		it = trim.begin();
654 
655 		string refQuery = query->getAligned();
656 
657 		//j<i, so you don't find distances from i to j and then j to i.
658 		for (int j = 0; j < ref.size(); j++) {
659 
660 			 string refJ = ref[j].seq->getAligned();
661 
662 			 for (int k = 0; k < windows.size(); k++) {
663 
664 					string QueryWindowk, refJWindowk;
665 
666 					if (k < windows.size()-1) {
667 						//get window strings
668 						QueryWindowk = refQuery.substr(windows[k], windowSizes);
669 						refJWindowk = refJ.substr(windows[k], windowSizes);
670 					}else { //last window may be smaller than rest - see findwindows
671 						//get window strings
672 						QueryWindowk = refQuery.substr(windows[k], (it->second-windows[k]));
673 						refJWindowk = refJ.substr(windows[k], (it->second-windows[k]));
674 					}
675 
676 					//find differences
677 					int diff = getDiff(QueryWindowk, refJWindowk);
678 
679 					//save differences
680 					diffs[j][k] = diff;
681 
682 			 }//k
683 		}//j
684 
685 
686 		//initialize sumRef for this query
687 		sumQuery.resize(windows.size(), 0);
688 		sumSquaredQuery.resize(windows.size(), 0);
689 		averageQuery.resize(windows.size(), 0);
690 
691 		//find the sum of the differences
692 		for (int j = 0; j < diffs.size(); j++) {
693 			for (int k = 0; k < diffs[j].size(); k++) {
694 				sumQuery[k] += diffs[j][k];
695 				sumSquaredQuery[k] += (diffs[j][k]*diffs[j][k]);
696 			}//k
697 		}//j
698 
699 
700 		//find the average of the differences for the references for each window
701 		for (int i = 0; i < windows.size(); i++) {
702 			averageQuery[i] = sumQuery[i] / (float) ref.size();
703 		}
704 	}
705 	catch(exception& e) {
706 		m->errorOut(e, "Ccode", "getAverageQuery");
707 		exit(1);
708 	}
709 }
710 /**************************************************************************************************/
findVarianceRef()711 void Ccode::findVarianceRef() {
712 	try {
713 
714 		varRef.resize(windows.size(), 0);
715 		sdRef.resize(windows.size(), 0);
716 
717 		//for each window
718 		for (int i = 0; i < windows.size(); i++) {
719 			varRef[i] = (sumSquaredRef[i] - ((sumRef[i]*sumRef[i])/(float)refCombo)) / (float)(refCombo-1);
720 			sdRef[i] = sqrt(varRef[i]);
721 
722 			//set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
723 			if (averageRef[i] < 0.001)			{	averageRef[i] = 0.001;		}
724 			if (sumRef[i] < 0.001)				{	sumRef[i] = 0.001;			}
725 			if (varRef[i] < 0.001)				{	varRef[i] = 0.001;			}
726 			if (sumSquaredRef[i] < 0.001)		{	sumSquaredRef[i] = 0.001;	}
727 			if (sdRef[i] < 0.001)				{	sdRef[i] = 0.001;			}
728 
729 		}
730 	}
731 	catch(exception& e) {
732 		m->errorOut(e, "Ccode", "findVarianceRef");
733 		exit(1);
734 	}
735 }
736 /**************************************************************************************************/
findVarianceQuery()737 void Ccode::findVarianceQuery() {
738 	try {
739 		varQuery.resize(windows.size(), 0);
740 		sdQuery.resize(windows.size(), 0);
741 
742 		//for each window
743 		for (int i = 0; i < windows.size(); i++) {
744 			varQuery[i] = (sumSquaredQuery[i] - ((sumQuery[i]*sumQuery[i])/(float) closest.size())) / (float) (closest.size()-1);
745 			sdQuery[i] = sqrt(varQuery[i]);
746 
747 			//set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
748 			if (averageQuery[i] < 0.001)			{	averageQuery[i] = 0.001;		}
749 			if (sumQuery[i] < 0.001)				{	sumQuery[i] = 0.001;			}
750 			if (varQuery[i] < 0.001)				{	varQuery[i] = 0.001;			}
751 			if (sumSquaredQuery[i] < 0.001)		{	sumSquaredQuery[i] = 0.001;	}
752 			if (sdQuery[i] < 0.001)				{	sdQuery[i] = 0.001;			}
753 		}
754 
755 	}
756 	catch(exception& e) {
757 		m->errorOut(e, "Ccode", "findVarianceQuery");
758 		exit(1);
759 	}
760 }
761 /**************************************************************************************************/
determineChimeras()762 void Ccode::determineChimeras() {
763 	try {
764 
765 		isChimericConfidence.resize(windows.size(), false);
766 		isChimericTStudent.resize(windows.size(), false);
767 		isChimericANOVA.resize(windows.size(), false);
768 		anova.resize(windows.size());
769 
770 
771 		//for each window
772 		for (int i = 0; i < windows.size(); i++) {
773 
774 			//get confidence limits
775 			float t = getT(closest.size()-1);  //how many seqs you are comparing to this querySeq
776 			float dsUpper = (averageQuery[i] + (t * sdQuery[i])) / averageRef[i];
777 			float dsLower = (averageQuery[i] - (t * sdQuery[i])) / averageRef[i];
778 
779 			if ((dsUpper > 1.0) && (dsLower > 1.0) && (averageQuery[i] > averageRef[i])) {  /* range does not include 1 */
780 					isChimericConfidence[i] = true;   /* significantly higher at P<0.05 */
781 
782 			}
783 
784 			//student t test
785 			int degreeOfFreedom = refCombo + closest.size() - 2;
786 			float denomForT = (((refCombo-1) * varQuery[i] + (closest.size() - 1) * varRef[i]) / (float) degreeOfFreedom) * ((refCombo + closest.size()) / (float) (refCombo * closest.size()));	/* denominator, without sqrt(), for ts calculations */
787 
788 			float ts = fabs((averageQuery[i] - averageRef[i]) / (sqrt(denomForT)));  /* value of ts for t-student test */
789 			t = getT(degreeOfFreedom);
790 
791 			if ((ts >= t) && (averageQuery[i] > averageRef[i])) {
792 				isChimericTStudent[i] = true;   /* significantly higher at P<0.05 */
793 			}
794 
795 			//anova test
796 			float value1 = sumQuery[i] + sumRef[i];
797 			float value2 = sumSquaredQuery[i] + sumSquaredRef[i];
798 			float value3 = ((sumQuery[i]*sumQuery[i]) / (float) (closest.size())) + ((sumRef[i] * sumRef[i]) / (float) refCombo);
799 			float value4 = (value1 * value1) / ( (float) (closest.size() + refCombo) );
800 			float value5 = value2 - value4;
801 			float value6 = value3 - value4;
802 			float value7 = value5 - value6;
803 			float value8 = value7 / ((float) degreeOfFreedom);
804 			float anovaValue = value6 / value8;
805 
806 			float f = getF(degreeOfFreedom);
807 
808 			 if ((anovaValue >= f) && (averageQuery[i] > averageRef[i]))  {
809 	               isChimericANOVA[i] = true;   /* significant P<0.05 */
810 	        }
811 
812 			if (isnan(anovaValue) || isinf(anovaValue)) { anovaValue = 0.0; }
813 
814 			anova[i] = anovaValue;
815 		}
816 
817 	}
818 	catch(exception& e) {
819 		m->errorOut(e, "Ccode", "determineChimeras");
820 		exit(1);
821 	}
822 }
823 /**************************************************************************************************/
getT(int numseq)824 float Ccode::getT(int numseq) {
825 	try {
826 
827 		float tvalue = 0;
828 
829 		/* t-student critical values for different degrees of freedom and alpha 0.1 in one-tail tests (equivalent to 0.05) */
830 		if (numseq > 120) tvalue = 1.645;
831 		else if (numseq > 60) tvalue = 1.658;
832         else if (numseq > 40) tvalue = 1.671;
833         else if (numseq > 30) tvalue = 1.684;
834         else if (numseq > 29) tvalue = 1.697;
835         else if (numseq > 28) tvalue = 1.699;
836         else if (numseq > 27) tvalue = 1.701;
837         else if (numseq > 26) tvalue = 1.703;
838         else if (numseq > 25) tvalue = 1.706;
839         else if (numseq > 24) tvalue = 1.708;
840         else if (numseq > 23) tvalue = 1.711;
841         else if (numseq > 22) tvalue = 1.714;
842         else if (numseq > 21) tvalue = 1.717;
843         else if (numseq > 20) tvalue = 1.721;
844         else if (numseq > 19) tvalue = 1.725;
845         else if (numseq > 18) tvalue = 1.729;
846         else if (numseq > 17) tvalue = 1.734;
847         else if (numseq > 16) tvalue = 1.740;
848         else if (numseq > 15) tvalue = 1.746;
849         else if (numseq > 14) tvalue = 1.753;
850         else if (numseq > 13) tvalue = 1.761;
851         else if (numseq > 12) tvalue = 1.771;
852         else if (numseq > 11) tvalue = 1.782;
853         else if (numseq > 10) tvalue = 1.796;
854         else if (numseq > 9) tvalue = 1.812;
855         else if (numseq > 8) tvalue = 1.833;
856         else if (numseq > 7) tvalue = 1.860;
857         else if (numseq > 6) tvalue = 1.895;
858         else if (numseq > 5) tvalue = 1.943;
859         else if (numseq > 4) tvalue = 2.015;
860         else if (numseq > 3) tvalue = 2.132;
861         else if (numseq > 2) tvalue = 2.353;
862         else if (numseq > 1) tvalue = 2.920;
863 		else if (numseq <= 1) {
864 			m->mothurOut("Two or more reference sequences are required, your data will be flawed.\n\n");
865 		}
866 
867 		return tvalue;
868 	}
869 	catch(exception& e) {
870 		m->errorOut(e, "Ccode", "getT");
871 		exit(1);
872 	}
873 }
874 /**************************************************************************************************/
getF(int numseq)875 float Ccode::getF(int numseq) {
876 	try {
877 
878 		float fvalue = 0;
879 
880 		 /* F-Snedecor critical values for v1=1 and different degrees of freedom v2 and alpha 0.05 */
881         if (numseq > 120) fvalue = 3.84;
882         else if (numseq > 60) fvalue = 3.92;
883         else if (numseq > 40) fvalue = 4.00;
884         else if (numseq > 30) fvalue = 4.08;
885         else if (numseq > 29) fvalue = 4.17;
886         else if (numseq > 28) fvalue = 4.18;
887         else if (numseq > 27) fvalue = 4.20;
888         else if (numseq > 26) fvalue = 4.21;
889         else if (numseq > 25) fvalue = 4.23;
890         else if (numseq > 24) fvalue = 4.24;
891         else if (numseq > 23) fvalue = 4.26;
892         else if (numseq > 22) fvalue = 4.28;
893         else if (numseq > 21) fvalue = 4.30;
894         else if (numseq > 20) fvalue = 4.32;
895         else if (numseq > 19) fvalue = 4.35;
896         else if (numseq > 18) fvalue = 4.38;
897         else if (numseq > 17) fvalue = 4.41;
898         else if (numseq > 16) fvalue = 4.45;
899         else if (numseq > 15) fvalue = 4.49;
900         else if (numseq > 14) fvalue = 4.54;
901         else if (numseq > 13) fvalue = 4.60;
902         else if (numseq > 12) fvalue = 4.67;
903         else if (numseq > 11) fvalue = 4.75;
904         else if (numseq > 10) fvalue = 4.84;
905         else if (numseq > 9) fvalue = 4.96;
906         else if (numseq > 8) fvalue = 5.12;
907         else if (numseq > 7) fvalue = 5.32;
908         else if (numseq > 6) fvalue = 5.59;
909         else if (numseq > 5) fvalue = 5.99;
910         else if (numseq > 4) fvalue = 6.61;
911         else if (numseq > 3) fvalue = 7.71;
912         else if (numseq > 2) fvalue = 10.1;
913         else if (numseq > 1) fvalue = 18.5;
914         else if (numseq > 0) fvalue = 161;
915 		else if (numseq <= 0) {
916 			m->mothurOut("Two or more reference sequences are required, your data will be flawed.\n\n");
917         }
918 
919 		return fvalue;
920 	}
921 	catch(exception& e) {
922 		m->errorOut(e, "Ccode", "getF");
923 		exit(1);
924 	}
925 }
926 //***************************************************************************************************************
927 
928 
929 
930