1 /*
2 * ccode.cpp
3 * Mothur
4 *
5 * Created by westcott on 8/24/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
7 *
8 */
9
10 #include "ccode.h"
11 #include "ignoregaps.h"
12 #include "eachgapdist.h"
13
14 //***************************************************************************************************************
Ccode(string filename,string temp,bool f,string mask,int win,int numW,string o)15 Ccode::Ccode(string filename, string temp, bool f, string mask, int win, int numW, string o) : MothurChimera() {
16 try {
17
18 fastafile = filename;
19 outputDir = o;
20 templateFileName = temp; templateSeqs = readSeqs(temp);
21 setMask(mask);
22 filter = f;
23 window = win;
24 numWanted = numW;
25
26 distCalc = new eachGapDist(1.0);
27 decalc = new DeCalculator();
28
29 Utils util;
30 mapInfo = outputDir + util.getRootName(util.getSimpleName(fastafile)) + "mapinfo";
31
32 ofstream out2;
33 util.openOutputFile(mapInfo, out2);
34
35 out2 << "Place in masked, filtered and trimmed sequence\tPlace in original alignment" << endl;
36 out2.close();
37
38 }
39 catch(exception& e) {
40 m->errorOut(e, "Ccode", "Ccode");
41 exit(1);
42 }
43 }
44 //***************************************************************************************************************
~Ccode()45 Ccode::~Ccode() {
46 delete distCalc;
47 delete decalc;
48 }
49 //***************************************************************************************************************
print(ostream & out,ostream & outAcc)50 Sequence Ccode::print(ostream& out, ostream& outAcc) {
51 try {
52
53 ofstream out2;
54 Utils util; util.openOutputFileAppend(mapInfo, out2);
55
56 out2 << querySeq->getName() << endl;
57 for (it = spotMap.begin(); it!= spotMap.end(); it++) {
58 out2 << it->first << '\t' << it->second << endl;
59 }
60 out2.close();
61 out << querySeq->getName() << endl << endl << "Reference sequences used and distance to query:" << endl;
62
63 for (int j = 0; j < closest.size(); j++) {
64 out << closest[j].seq->getName() << '\t' << closest[j].dist << endl;
65 }
66 out << endl << endl;
67
68 //for each window
69 //window mapping info.
70 out << "Mapping information: ";
71 //you mask and did not filter
72 if ((seqMask != "") && (!filter)) { out << "mask and trim."; }
73
74 //you filtered and did not mask
75 if ((seqMask == "") && (filter)) { out << "filter and trim."; }
76
77 //you masked and filtered
78 if ((seqMask != "") && (filter)) { out << "mask, filter and trim."; }
79
80 out << endl << "Window\tStartPos\tEndPos" << endl;
81 it = trim.begin();
82 for (int k = 0; k < windows.size()-1; k++) {
83 out << k+1 << '\t' << spotMap[windows[k]-it->first] << '\t' << spotMap[windows[k]-it->first+windowSizes] << endl;
84 }
85
86 out << windows.size() << '\t' << spotMap[windows[windows.size()-1]-it->first] << '\t' << spotMap[it->second-it->first-1] << endl;
87 out << endl;
88 out << "Window\tAvgQ\t(sdQ)\tAvgR\t(sdR)\tRatio\tAnova" << endl;
89 for (int k = 0; k < windows.size(); k++) {
90 float ds = averageQuery[k] / averageRef[k];
91 out << k+1 << '\t' << averageQuery[k] << '\t' << sdQuery[k] << '\t' << averageRef[k] << '\t'<< sdRef[k] << '\t' << ds << '\t' << anova[k] << endl;
92 }
93 out << endl;
94
95 //varRef
96 //varQuery
97 /* F test for differences among variances.
98 * varQuery is expected to be higher or similar than varRef */
99 //float fs = varQuery[query] / varRef[query]; /* F-Snedecor, test for differences of variances */
100
101 bool results = false;
102
103 //confidence limit, t - Student, anova
104 out << "Window\tConfidenceLimit\tt-Student\tAnova" << endl;
105
106 for (int k = 0; k < windows.size(); k++) {
107 string temp = "";
108 if (isChimericConfidence[k]) { temp += "*\t"; }
109 else { temp += "\t"; }
110
111 if (isChimericTStudent[k]) { temp += "*\t"; }
112 else { temp += "\t"; }
113
114 if (isChimericANOVA[k]) { temp += "*\t"; }
115 else { temp += "\t"; }
116
117 out << k+1 << '\t' << temp << endl;
118
119 if (temp == "*\t*\t*\t") { results = true; }
120 }
121 out << endl;
122
123 if (results) {
124 m->mothurOut(querySeq->getName() + " was found have at least one chimeric window.\n");
125 outAcc << querySeq->getName() << endl;
126 }
127
128 //free memory
129 for (int i = 0; i < closest.size(); i++) { delete closest[i].seq; }
130
131 return *querySeq;
132 }
133 catch(exception& e) {
134 m->errorOut(e, "Ccode", "print");
135 exit(1);
136 }
137 }
138 //***************************************************************************************************************
getChimeras(Sequence * query)139 int Ccode::getChimeras(Sequence* query) {
140 try {
141
142 closest.clear();
143 refCombo = 0;
144 sumRef.clear();
145 varRef.clear();
146 varQuery.clear();
147 sdRef.clear();
148 sdQuery.clear();
149 sumQuery.clear();
150 sumSquaredRef.clear();
151 sumSquaredQuery.clear();
152 averageRef.clear();
153 averageQuery.clear();
154 anova.clear();
155 isChimericConfidence.clear();
156 isChimericTStudent.clear();
157 isChimericANOVA.clear();
158 trim.clear();
159 spotMap.clear();
160 windowSizes = window;
161 windows.clear();
162
163
164 querySeq = query;
165
166 //find closest matches to query
167 closest = findClosest(query, numWanted);
168
169 if (m->getControl_pressed()) { return 0; }
170
171 //initialize spotMap
172 for (int i = 0; i < query->getAligned().length(); i++) { spotMap[i] = i; }
173
174 //mask sequences if the user wants to
175 if (seqMask != "") {
176 decalc->setMask(seqMask);
177
178 decalc->runMask(query);
179
180 //mask closest
181 for (int i = 0; i < closest.size(); i++) { decalc->runMask(closest[i].seq); }
182
183 spotMap = decalc->getMaskMap();
184 }
185
186 if (filter) {
187 vector<Sequence*> temp;
188 for (int i = 0; i < closest.size(); i++) { temp.push_back(closest[i].seq); }
189 temp.push_back(query);
190
191 createFilter(temp, 0.5);
192
193 for (int i = 0; i < temp.size(); i++) {
194 if (m->getControl_pressed()) { return 0; }
195 runFilter(temp[i]);
196 }
197
198 //update spotMap
199 map<int, int> newMap;
200 int spot = 0;
201
202 for (int i = 0; i < filterString.length(); i++) {
203 if (filterString[i] == '1') {
204 //add to newMap
205 newMap[spot] = spotMap[i];
206 spot++;
207 }
208 }
209 spotMap = newMap;
210 }
211
212 //trim sequences - this follows ccodes remove_extra_gaps
213 trimSequences(query);
214 if (m->getControl_pressed()) { return 0; }
215
216 //windows are equivalent to words - ccode paper recommends windows are between 5% and 20% on alignment length().
217 //Our default will be 10% and we will warn if user tries to use a window above or below these recommendations
218 windows = findWindows();
219 if (m->getControl_pressed()) { return 0; }
220
221 //remove sequences that are more than 20% different and less than 0.5% different - may want to allow user to specify this later
222 removeBadReferenceSeqs(closest);
223 if (m->getControl_pressed()) { return 0; }
224
225 //find the averages for each querys references
226 getAverageRef(closest); //fills sumRef, averageRef, sumSquaredRef and refCombo.
227 getAverageQuery(closest, query); //fills sumQuery, averageQuery, sumSquaredQuery.
228 if (m->getControl_pressed()) { return 0; }
229
230 //find the averages for each querys references
231 findVarianceRef(); //fills varRef and sdRef also sets minimum error rate to 0.001 to avoid divide by 0.
232 if (m->getControl_pressed()) { return 0; }
233
234 //find the averages for the query
235 findVarianceQuery(); //fills varQuery and sdQuery also sets minimum error rate to 0.001 to avoid divide by 0.
236 if (m->getControl_pressed()) { return 0; }
237
238 determineChimeras(); //fills anova, isChimericConfidence, isChimericTStudent and isChimericANOVA.
239 if (m->getControl_pressed()) { return 0; }
240
241 return 0;
242 }
243 catch(exception& e) {
244 m->errorOut(e, "Ccode", "getChimeras");
245 exit(1);
246 }
247 }
248 /***************************************************************************************************************/
249 //ccode algo says it does this to "Removes the initial and final gaps to avoid biases due to incomplete sequences."
trimSequences(Sequence * query)250 void Ccode::trimSequences(Sequence* query) {
251 try {
252
253 int frontPos = 0; //should contain first position in all seqs that is not a gap character
254 int rearPos = query->getAligned().length();
255
256 //********find first position in closest seqs that is a non gap character***********//
257 //find first position all query seqs that is a non gap character
258 for (int i = 0; i < closest.size(); i++) {
259
260 string aligned = closest[i].seq->getAligned();
261 int pos = 0;
262
263 //find first spot in this seq
264 for (int j = 0; j < aligned.length(); j++) {
265 if (isalpha(aligned[j])) {
266 pos = j;
267 break;
268 }
269 }
270
271 //save this spot if it is the farthest
272 if (pos > frontPos) { frontPos = pos; }
273 }
274
275 //find first position all querySeq[query] that is a non gap character
276 string aligned = query->getAligned();
277 int pos = 0;
278
279 //find first spot in this seq
280 for (int j = 0; j < aligned.length(); j++) {
281 if (isalpha(aligned[j])) {
282 pos = j;
283 break;
284 }
285 }
286
287 //save this spot if it is the farthest
288 if (pos > frontPos) { frontPos = pos; }
289
290
291 //********find last position in closest seqs that is a non gap character***********//
292 for (int i = 0; i < closest.size(); i++) {
293
294 string aligned = closest[i].seq->getAligned();
295 int pos = aligned.length();
296
297 //find first spot in this seq
298 for (int j = aligned.length()-1; j >= 0; j--) {
299 if (isalpha(aligned[j])) {
300 pos = j;
301 break;
302 }
303 }
304
305 //save this spot if it is the farthest
306 if (pos < rearPos) { rearPos = pos; }
307 }
308
309 //find last position all querySeqs[query] that is a non gap character
310 aligned = query->getAligned();
311 pos = aligned.length();
312
313 //find first spot in this seq
314 for (int j = aligned.length()-1; j >= 0; j--) {
315 if (isalpha(aligned[j])) {
316 pos = j;
317 break;
318 }
319 }
320
321 //save this spot if it is the farthest
322 if (pos < rearPos) { rearPos = pos; }
323
324
325 //check to make sure that is not whole seq
326 if ((rearPos - frontPos - 1) <= 0) { m->mothurOut("Error, when I trim your sequences, the entire sequence is trimmed.\n"); exit(1); }
327
328 map<int, int> tempTrim;
329 tempTrim[frontPos] = rearPos;
330
331 //save trimmed locations
332 trim = tempTrim;
333
334 //update spotMask
335 map<int, int> newMap;
336 int spot = 0;
337
338 for (int i = frontPos; i < rearPos; i++) {
339 //add to newMap
340 newMap[spot] = spotMap[i];
341 spot++;
342 }
343 spotMap = newMap;
344 }
345 catch(exception& e) {
346 m->errorOut(e, "Ccode", "trimSequences");
347 exit(1);
348 }
349 }
350 /***************************************************************************************************************/
findWindows()351 vector<int> Ccode::findWindows() {
352 try {
353
354 vector<int> win;
355 it = trim.begin();
356
357 int length = it->second - it->first;
358
359 //default is wanted = 10% of total length
360 if (windowSizes > length) {
361 m->mothurOut("You have slected a window larger than your sequence length after all filters, masks and trims have been done. I will use the default 10% of sequence length.");
362 windowSizes = length / 10;
363 }else if (windowSizes == 0) { windowSizes = length / 10; }
364 else if (windowSizes > (length * 0.20)) {
365 m->mothurOut("You have selected a window that is larger than 20% of your sequence length. This is not recommended, but I will continue anyway.\n");
366 }else if (windowSizes < (length * 0.05)) {
367 m->mothurOut("You have selected a window that is smaller than 5% of your sequence length. This is not recommended, but I will continue anyway.\n");
368 }
369
370 //save starting points of each window
371 for (int m = it->first; m < (it->second-windowSizes); m+=windowSizes) { win.push_back(m); }
372
373 //save last window
374 if (win[win.size()-1] < (it->first+length)) {
375 win.push_back(win[win.size()-1]+windowSizes); // ex. string length is 115, window is 25, without this you would get 0, 25, 50, 75
376 } //with this you would get 1,25,50,75,100
377
378 return win;
379 }
380 catch(exception& e) {
381 m->errorOut(e, "Ccode", "findWindows");
382 exit(1);
383 }
384 }
385 //***************************************************************************************************************
getDiff(string seqA,string seqB)386 int Ccode::getDiff(string seqA, string seqB) {
387 try {
388
389 int numDiff = 0;
390
391 for (int i = 0; i < seqA.length(); i++) {
392 //if you are both not gaps
393 //if (isalpha(seqA[i]) && isalpha(seqA[i])) {
394 //are you different
395 if (seqA[i] != seqB[i]) {
396 int ok; /* ok=1 means equivalent base. Checks for degenerate bases */
397
398 /* the char in base_a and base_b have been checked and they are different */
399 if ((seqA[i] == 'N') && (seqB[i] != '-')) ok = 1;
400 else if ((seqB[i] == 'N') && (seqA[i] != '-')) ok = 1;
401 else if ((seqA[i] == 'Y') && ((seqB[i] == 'C') || (seqB[i] == 'T'))) ok = 1;
402 else if ((seqB[i] == 'Y') && ((seqA[i] == 'C') || (seqA[i] == 'T'))) ok = 1;
403 else if ((seqA[i] == 'R') && ((seqB[i] == 'G') || (seqB[i] == 'A'))) ok = 1;
404 else if ((seqB[i] == 'R') && ((seqA[i] == 'G') || (seqA[i] == 'A'))) ok = 1;
405 else if ((seqA[i] == 'S') && ((seqB[i] == 'C') || (seqB[i] == 'G'))) ok = 1;
406 else if ((seqB[i] == 'S') && ((seqA[i] == 'C') || (seqA[i] == 'G'))) ok = 1;
407 else if ((seqA[i] == 'W') && ((seqB[i] == 'T') || (seqB[i] == 'A'))) ok = 1;
408 else if ((seqB[i] == 'W') && ((seqA[i] == 'T') || (seqA[i] == 'A'))) ok = 1;
409 else if ((seqA[i] == 'M') && ((seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
410 else if ((seqB[i] == 'M') && ((seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
411 else if ((seqA[i] == 'K') && ((seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
412 else if ((seqB[i] == 'K') && ((seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
413 else if ((seqA[i] == 'V') && ((seqB[i] == 'C') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
414 else if ((seqB[i] == 'V') && ((seqA[i] == 'C') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
415 else if ((seqA[i] == 'H') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'C'))) ok = 1;
416 else if ((seqB[i] == 'H') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'C'))) ok = 1;
417 else if ((seqA[i] == 'D') && ((seqB[i] == 'T') || (seqB[i] == 'A') || (seqB[i] == 'G'))) ok = 1;
418 else if ((seqB[i] == 'D') && ((seqA[i] == 'T') || (seqA[i] == 'A') || (seqA[i] == 'G'))) ok = 1;
419 else if ((seqA[i] == 'B') && ((seqB[i] == 'C') || (seqB[i] == 'T') || (seqB[i] == 'G'))) ok = 1;
420 else if ((seqB[i] == 'B') && ((seqA[i] == 'C') || (seqA[i] == 'T') || (seqA[i] == 'G'))) ok = 1;
421 else ok = 0; /* the bases are different and not equivalent */
422
423 //check if they are both blanks
424 if ((seqA[i] == '.') && (seqB[i] == '-')) ok = 1;
425 else if ((seqB[i] == '.') && (seqA[i] == '-')) ok = 1;
426
427 if (ok == 0) { numDiff++; }
428 }
429 //}
430 }
431
432 return numDiff;
433
434 }
435 catch(exception& e) {
436 m->errorOut(e, "Ccode", "getDiff");
437 exit(1);
438 }
439 }
440 //***************************************************************************************************************
441 //tried to make this look most like ccode original implementation
removeBadReferenceSeqs(vector<SeqDist> & seqs)442 void Ccode::removeBadReferenceSeqs(vector<SeqDist>& seqs) {
443 try {
444
445 vector< vector<int> > numDiffBases;
446 numDiffBases.resize(seqs.size());
447 //initialize to 0
448 for (int i = 0; i < numDiffBases.size(); i++) { numDiffBases[i].resize(seqs.size(),0); }
449
450 it = trim.begin();
451 int length = it->second - it->first;
452
453 //calc differences from each sequence to everyother seq in the set
454 for (int i = 0; i < seqs.size(); i++) {
455
456 string seqA = seqs[i].seq->getAligned().substr(it->first, length);
457
458 //so you don't calc i to j and j to i since they are the same
459 for (int j = 0; j < i; j++) {
460
461 string seqB = seqs[j].seq->getAligned().substr(it->first, length);
462
463 //compare strings
464 int numDiff = getDiff(seqA, seqB);
465
466 numDiffBases[i][j] = numDiff;
467 numDiffBases[j][i] = numDiff;
468 }
469 }
470
471 //initailize remove to 0
472 vector<int> remove; remove.resize(seqs.size(), 0);
473 float top = ((20*length) / (float) 100);
474 float bottom = ((0.5*length) / (float) 100);
475
476 //check each numDiffBases and if any are higher than threshold set remove to 1 so you can remove those seqs from the closest set
477 for (int i = 0; i < numDiffBases.size(); i++) {
478 for (int j = 0; j < i; j++) {
479 //are you more than 20% different
480 if (numDiffBases[i][j] > top) { remove[j] = 1; }
481 //are you less than 0.5% different
482 if (numDiffBases[i][j] < bottom) { remove[j] = 1; }
483 }
484 }
485
486 int numSeqsLeft = 0;
487
488 //count seqs that are not going to be removed
489 for (int i = 0; i < remove.size(); i++) {
490 if (remove[i] == 0) { numSeqsLeft++; }
491 }
492
493 //if you have enough then remove bad ones
494 if (numSeqsLeft >= 3) {
495 vector<SeqDist> goodSeqs;
496 //remove bad seqs
497 for (int i = 0; i < remove.size(); i++) {
498 if (remove[i] == 0) {
499 goodSeqs.push_back(seqs[i]);
500 }
501 }
502
503 seqs = goodSeqs;
504
505 }else { //warn, but dont remove any
506 m->mothurOut(querySeq->getName() + " does not have an adaquate number of reference sequences that are within 20% and 0.5% similarity. I will continue, but please check.\n");
507 }
508
509 }
510 catch(exception& e) {
511 m->errorOut(e, "Ccode", "removeBadReferenceSeqs");
512 exit(1);
513 }
514 }
515 //***************************************************************************************************************
516 //makes copy of templateseq for filter
findClosest(Sequence * q,int numWanted)517 vector<SeqDist> Ccode::findClosest(Sequence* q, int numWanted) {
518 try{
519
520 vector<SeqDist> topMatches;
521
522 Sequence query = *(q);
523
524 //calc distance to each sequence in template seqs
525 for (int i = 0; i < templateSeqs.size(); i++) {
526
527 Sequence ref = *(templateSeqs[i]);
528
529 //find overall dist
530 double dist = distCalc->calcDist(query, ref);
531
532 //save distance
533 SeqDist temp;
534 temp.seq = new Sequence(templateSeqs[i]->getName(), templateSeqs[i]->getAligned());
535 temp.dist = dist;
536
537 topMatches.push_back(temp);
538 }
539
540 sort(topMatches.begin(), topMatches.end(), compareSeqDist);
541
542 for (int i = numWanted; i < topMatches.size(); i++) { delete topMatches[i].seq; }
543
544 topMatches.resize(numWanted);
545
546 return topMatches;
547
548 }
549 catch(exception& e) {
550 m->errorOut(e, "Ccode", "findClosestSides");
551 exit(1);
552 }
553 }
554 /**************************************************************************************************/
555 //find the distances from each reference sequence to every other reference sequence for each window for this query
getAverageRef(vector<SeqDist> ref)556 void Ccode::getAverageRef(vector<SeqDist> ref) {
557 try {
558
559 vector< vector< vector<int> > > diffs; //diffs[0][1][2] is the number of differences between ref seq 0 and ref seq 1 at window 2.
560
561 //initialize diffs vector
562 diffs.resize(ref.size());
563 for (int i = 0; i < diffs.size(); i++) {
564 diffs[i].resize(ref.size());
565 for (int j = 0; j < diffs[i].size(); j++) {
566 diffs[i][j].resize(windows.size(), 0);
567 }
568 }
569
570 it = trim.begin();
571
572 //find the distances from each reference sequence to every other reference sequence for each window for this query
573 for (int i = 0; i < ref.size(); i++) {
574
575 string refI = ref[i].seq->getAligned();
576
577 //j<i, so you don't find distances from i to j and then j to i.
578 for (int j = 0; j < i; j++) {
579
580 string refJ = ref[j].seq->getAligned();
581
582 for (int k = 0; k < windows.size(); k++) {
583
584 string refIWindowk, refJWindowk;
585
586 if (k < windows.size()-1) {
587 //get window strings
588 refIWindowk = refI.substr(windows[k], windowSizes);
589 refJWindowk = refJ.substr(windows[k], windowSizes);
590 }else { //last window may be smaller than rest - see findwindows
591 //get window strings
592 refIWindowk = refI.substr(windows[k], (it->second-windows[k]));
593 refJWindowk = refJ.substr(windows[k], (it->second-windows[k]));
594 }
595
596 //find differences
597 int diff = getDiff(refIWindowk, refJWindowk);
598
599 //save differences in [i][j][k] and [j][i][k] since they are the same
600 diffs[i][j][k] = diff;
601 diffs[j][i][k] = diff;
602
603 }//k
604
605 }//j
606
607 }//i
608
609 //initialize sumRef for this query
610 sumRef.resize(windows.size(), 0);
611 sumSquaredRef.resize(windows.size(), 0);
612 averageRef.resize(windows.size(), 0);
613
614 //find the sum of the differences for hte reference sequences
615 for (int i = 0; i < diffs.size(); i++) {
616 for (int j = 0; j < i; j++) {
617
618 //increment this querys reference sequences combos
619 refCombo++;
620
621 for (int k = 0; k < diffs[i][j].size(); k++) {
622 sumRef[k] += diffs[i][j][k];
623 sumSquaredRef[k] += (diffs[i][j][k]*diffs[i][j][k]);
624 }//k
625
626 }//j
627 }//i
628
629
630 //find the average of the differences for the references for each window
631 for (int i = 0; i < windows.size(); i++) {
632 averageRef[i] = sumRef[i] / (float) refCombo;
633 }
634
635 }
636 catch(exception& e) {
637 m->errorOut(e, "Ccode", "getAverageRef");
638 exit(1);
639 }
640 }
641 /**************************************************************************************************/
getAverageQuery(vector<SeqDist> ref,Sequence * query)642 void Ccode::getAverageQuery (vector<SeqDist> ref, Sequence* query) {
643 try {
644
645 vector< vector<int> > diffs; //diffs[1][2] is the number of differences between querySeqs[query] and ref seq 1 at window 2.
646
647 //initialize diffs vector
648 diffs.resize(ref.size());
649 for (int j = 0; j < diffs.size(); j++) {
650 diffs[j].resize(windows.size(), 0);
651 }
652
653 it = trim.begin();
654
655 string refQuery = query->getAligned();
656
657 //j<i, so you don't find distances from i to j and then j to i.
658 for (int j = 0; j < ref.size(); j++) {
659
660 string refJ = ref[j].seq->getAligned();
661
662 for (int k = 0; k < windows.size(); k++) {
663
664 string QueryWindowk, refJWindowk;
665
666 if (k < windows.size()-1) {
667 //get window strings
668 QueryWindowk = refQuery.substr(windows[k], windowSizes);
669 refJWindowk = refJ.substr(windows[k], windowSizes);
670 }else { //last window may be smaller than rest - see findwindows
671 //get window strings
672 QueryWindowk = refQuery.substr(windows[k], (it->second-windows[k]));
673 refJWindowk = refJ.substr(windows[k], (it->second-windows[k]));
674 }
675
676 //find differences
677 int diff = getDiff(QueryWindowk, refJWindowk);
678
679 //save differences
680 diffs[j][k] = diff;
681
682 }//k
683 }//j
684
685
686 //initialize sumRef for this query
687 sumQuery.resize(windows.size(), 0);
688 sumSquaredQuery.resize(windows.size(), 0);
689 averageQuery.resize(windows.size(), 0);
690
691 //find the sum of the differences
692 for (int j = 0; j < diffs.size(); j++) {
693 for (int k = 0; k < diffs[j].size(); k++) {
694 sumQuery[k] += diffs[j][k];
695 sumSquaredQuery[k] += (diffs[j][k]*diffs[j][k]);
696 }//k
697 }//j
698
699
700 //find the average of the differences for the references for each window
701 for (int i = 0; i < windows.size(); i++) {
702 averageQuery[i] = sumQuery[i] / (float) ref.size();
703 }
704 }
705 catch(exception& e) {
706 m->errorOut(e, "Ccode", "getAverageQuery");
707 exit(1);
708 }
709 }
710 /**************************************************************************************************/
findVarianceRef()711 void Ccode::findVarianceRef() {
712 try {
713
714 varRef.resize(windows.size(), 0);
715 sdRef.resize(windows.size(), 0);
716
717 //for each window
718 for (int i = 0; i < windows.size(); i++) {
719 varRef[i] = (sumSquaredRef[i] - ((sumRef[i]*sumRef[i])/(float)refCombo)) / (float)(refCombo-1);
720 sdRef[i] = sqrt(varRef[i]);
721
722 //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
723 if (averageRef[i] < 0.001) { averageRef[i] = 0.001; }
724 if (sumRef[i] < 0.001) { sumRef[i] = 0.001; }
725 if (varRef[i] < 0.001) { varRef[i] = 0.001; }
726 if (sumSquaredRef[i] < 0.001) { sumSquaredRef[i] = 0.001; }
727 if (sdRef[i] < 0.001) { sdRef[i] = 0.001; }
728
729 }
730 }
731 catch(exception& e) {
732 m->errorOut(e, "Ccode", "findVarianceRef");
733 exit(1);
734 }
735 }
736 /**************************************************************************************************/
findVarianceQuery()737 void Ccode::findVarianceQuery() {
738 try {
739 varQuery.resize(windows.size(), 0);
740 sdQuery.resize(windows.size(), 0);
741
742 //for each window
743 for (int i = 0; i < windows.size(); i++) {
744 varQuery[i] = (sumSquaredQuery[i] - ((sumQuery[i]*sumQuery[i])/(float) closest.size())) / (float) (closest.size()-1);
745 sdQuery[i] = sqrt(varQuery[i]);
746
747 //set minimum error rate to 0.001 - to avoid potential divide by zero - not sure if this is necessary but it follows ccode implementation
748 if (averageQuery[i] < 0.001) { averageQuery[i] = 0.001; }
749 if (sumQuery[i] < 0.001) { sumQuery[i] = 0.001; }
750 if (varQuery[i] < 0.001) { varQuery[i] = 0.001; }
751 if (sumSquaredQuery[i] < 0.001) { sumSquaredQuery[i] = 0.001; }
752 if (sdQuery[i] < 0.001) { sdQuery[i] = 0.001; }
753 }
754
755 }
756 catch(exception& e) {
757 m->errorOut(e, "Ccode", "findVarianceQuery");
758 exit(1);
759 }
760 }
761 /**************************************************************************************************/
determineChimeras()762 void Ccode::determineChimeras() {
763 try {
764
765 isChimericConfidence.resize(windows.size(), false);
766 isChimericTStudent.resize(windows.size(), false);
767 isChimericANOVA.resize(windows.size(), false);
768 anova.resize(windows.size());
769
770
771 //for each window
772 for (int i = 0; i < windows.size(); i++) {
773
774 //get confidence limits
775 float t = getT(closest.size()-1); //how many seqs you are comparing to this querySeq
776 float dsUpper = (averageQuery[i] + (t * sdQuery[i])) / averageRef[i];
777 float dsLower = (averageQuery[i] - (t * sdQuery[i])) / averageRef[i];
778
779 if ((dsUpper > 1.0) && (dsLower > 1.0) && (averageQuery[i] > averageRef[i])) { /* range does not include 1 */
780 isChimericConfidence[i] = true; /* significantly higher at P<0.05 */
781
782 }
783
784 //student t test
785 int degreeOfFreedom = refCombo + closest.size() - 2;
786 float denomForT = (((refCombo-1) * varQuery[i] + (closest.size() - 1) * varRef[i]) / (float) degreeOfFreedom) * ((refCombo + closest.size()) / (float) (refCombo * closest.size())); /* denominator, without sqrt(), for ts calculations */
787
788 float ts = fabs((averageQuery[i] - averageRef[i]) / (sqrt(denomForT))); /* value of ts for t-student test */
789 t = getT(degreeOfFreedom);
790
791 if ((ts >= t) && (averageQuery[i] > averageRef[i])) {
792 isChimericTStudent[i] = true; /* significantly higher at P<0.05 */
793 }
794
795 //anova test
796 float value1 = sumQuery[i] + sumRef[i];
797 float value2 = sumSquaredQuery[i] + sumSquaredRef[i];
798 float value3 = ((sumQuery[i]*sumQuery[i]) / (float) (closest.size())) + ((sumRef[i] * sumRef[i]) / (float) refCombo);
799 float value4 = (value1 * value1) / ( (float) (closest.size() + refCombo) );
800 float value5 = value2 - value4;
801 float value6 = value3 - value4;
802 float value7 = value5 - value6;
803 float value8 = value7 / ((float) degreeOfFreedom);
804 float anovaValue = value6 / value8;
805
806 float f = getF(degreeOfFreedom);
807
808 if ((anovaValue >= f) && (averageQuery[i] > averageRef[i])) {
809 isChimericANOVA[i] = true; /* significant P<0.05 */
810 }
811
812 if (isnan(anovaValue) || isinf(anovaValue)) { anovaValue = 0.0; }
813
814 anova[i] = anovaValue;
815 }
816
817 }
818 catch(exception& e) {
819 m->errorOut(e, "Ccode", "determineChimeras");
820 exit(1);
821 }
822 }
823 /**************************************************************************************************/
getT(int numseq)824 float Ccode::getT(int numseq) {
825 try {
826
827 float tvalue = 0;
828
829 /* t-student critical values for different degrees of freedom and alpha 0.1 in one-tail tests (equivalent to 0.05) */
830 if (numseq > 120) tvalue = 1.645;
831 else if (numseq > 60) tvalue = 1.658;
832 else if (numseq > 40) tvalue = 1.671;
833 else if (numseq > 30) tvalue = 1.684;
834 else if (numseq > 29) tvalue = 1.697;
835 else if (numseq > 28) tvalue = 1.699;
836 else if (numseq > 27) tvalue = 1.701;
837 else if (numseq > 26) tvalue = 1.703;
838 else if (numseq > 25) tvalue = 1.706;
839 else if (numseq > 24) tvalue = 1.708;
840 else if (numseq > 23) tvalue = 1.711;
841 else if (numseq > 22) tvalue = 1.714;
842 else if (numseq > 21) tvalue = 1.717;
843 else if (numseq > 20) tvalue = 1.721;
844 else if (numseq > 19) tvalue = 1.725;
845 else if (numseq > 18) tvalue = 1.729;
846 else if (numseq > 17) tvalue = 1.734;
847 else if (numseq > 16) tvalue = 1.740;
848 else if (numseq > 15) tvalue = 1.746;
849 else if (numseq > 14) tvalue = 1.753;
850 else if (numseq > 13) tvalue = 1.761;
851 else if (numseq > 12) tvalue = 1.771;
852 else if (numseq > 11) tvalue = 1.782;
853 else if (numseq > 10) tvalue = 1.796;
854 else if (numseq > 9) tvalue = 1.812;
855 else if (numseq > 8) tvalue = 1.833;
856 else if (numseq > 7) tvalue = 1.860;
857 else if (numseq > 6) tvalue = 1.895;
858 else if (numseq > 5) tvalue = 1.943;
859 else if (numseq > 4) tvalue = 2.015;
860 else if (numseq > 3) tvalue = 2.132;
861 else if (numseq > 2) tvalue = 2.353;
862 else if (numseq > 1) tvalue = 2.920;
863 else if (numseq <= 1) {
864 m->mothurOut("Two or more reference sequences are required, your data will be flawed.\n\n");
865 }
866
867 return tvalue;
868 }
869 catch(exception& e) {
870 m->errorOut(e, "Ccode", "getT");
871 exit(1);
872 }
873 }
874 /**************************************************************************************************/
getF(int numseq)875 float Ccode::getF(int numseq) {
876 try {
877
878 float fvalue = 0;
879
880 /* F-Snedecor critical values for v1=1 and different degrees of freedom v2 and alpha 0.05 */
881 if (numseq > 120) fvalue = 3.84;
882 else if (numseq > 60) fvalue = 3.92;
883 else if (numseq > 40) fvalue = 4.00;
884 else if (numseq > 30) fvalue = 4.08;
885 else if (numseq > 29) fvalue = 4.17;
886 else if (numseq > 28) fvalue = 4.18;
887 else if (numseq > 27) fvalue = 4.20;
888 else if (numseq > 26) fvalue = 4.21;
889 else if (numseq > 25) fvalue = 4.23;
890 else if (numseq > 24) fvalue = 4.24;
891 else if (numseq > 23) fvalue = 4.26;
892 else if (numseq > 22) fvalue = 4.28;
893 else if (numseq > 21) fvalue = 4.30;
894 else if (numseq > 20) fvalue = 4.32;
895 else if (numseq > 19) fvalue = 4.35;
896 else if (numseq > 18) fvalue = 4.38;
897 else if (numseq > 17) fvalue = 4.41;
898 else if (numseq > 16) fvalue = 4.45;
899 else if (numseq > 15) fvalue = 4.49;
900 else if (numseq > 14) fvalue = 4.54;
901 else if (numseq > 13) fvalue = 4.60;
902 else if (numseq > 12) fvalue = 4.67;
903 else if (numseq > 11) fvalue = 4.75;
904 else if (numseq > 10) fvalue = 4.84;
905 else if (numseq > 9) fvalue = 4.96;
906 else if (numseq > 8) fvalue = 5.12;
907 else if (numseq > 7) fvalue = 5.32;
908 else if (numseq > 6) fvalue = 5.59;
909 else if (numseq > 5) fvalue = 5.99;
910 else if (numseq > 4) fvalue = 6.61;
911 else if (numseq > 3) fvalue = 7.71;
912 else if (numseq > 2) fvalue = 10.1;
913 else if (numseq > 1) fvalue = 18.5;
914 else if (numseq > 0) fvalue = 161;
915 else if (numseq <= 0) {
916 m->mothurOut("Two or more reference sequences are required, your data will be flawed.\n\n");
917 }
918
919 return fvalue;
920 }
921 catch(exception& e) {
922 m->errorOut(e, "Ccode", "getF");
923 exit(1);
924 }
925 }
926 //***************************************************************************************************************
927
928
929
930