1 /*
2 * refchimeratest.cpp
3 * Mothur
4 *
5 * Created by Pat Schloss on 1/31/11.
6 * Copyright 2011 Schloss Lab. All rights reserved.
7 *
8 */
9
10 #include "refchimeratest.h"
11 #include "mothur.h"
12
13 int MAXINT = numeric_limits<int>::max();
14
15 //***************************************************************************************************************
16
RefChimeraTest(vector<Sequence> & refs,bool aligned)17 RefChimeraTest::RefChimeraTest(vector<Sequence>& refs, bool aligned) : aligned(aligned){
18
19 m = MothurOut::getInstance();
20
21 numRefSeqs = refs.size();
22
23 referenceSeqs.resize(numRefSeqs);
24 referenceNames.resize(numRefSeqs);
25 for(int i=0;i<numRefSeqs;i++){
26 if (aligned) { referenceSeqs[i] = refs[i].getAligned(); }
27 else { referenceSeqs[i] = refs[i].getUnaligned(); }
28 referenceNames[i] = refs[i].getName();
29 }
30
31 alignLength = referenceSeqs[0].length();
32 bestMatch = 0;
33
34 }
35 //***************************************************************************************************************
36
getHeader()37 string RefChimeraTest::getHeader(){
38 try {
39 return ("queryName\tbestRef\tbestSequenceMismatch\tleftParentChi,rightParentChi\tbreakPointChi\tminMismatchToChimera\tdistToBestMera\tnumParents\n");
40
41 }catch(exception& e) {
42 m->errorOut(e, "RefChimeraTest", "printHeader");
43 exit(1);
44 }
45 }
46
47 //***************************************************************************************************************
48
analyzeQuery(string queryName,string querySeq,string & output)49 int RefChimeraTest::analyzeQuery(string queryName, string querySeq, string& output){
50
51 int numParents = -1;
52
53 if(aligned){
54 numParents = analyzeAlignedQuery(queryName, querySeq, output);
55 }
56 else{
57 numParents = analyzeUnalignedQuery(queryName, querySeq, output);
58 }
59
60 return numParents;
61
62 }
63
64 //***************************************************************************************************************
65
analyzeAlignedQuery(string queryName,string querySeq,string & output)66 int RefChimeraTest::analyzeAlignedQuery(string queryName, string querySeq, string& output){
67
68 vector<vector<int> > left; left.resize(numRefSeqs);
69 vector<vector<int> > right; right.resize(numRefSeqs);
70 vector<int> singleLeft, bestLeft;
71 vector<int> singleRight, bestRight;
72
73 for(int i=0;i<numRefSeqs;i++){
74 left[i].assign(alignLength, 0);
75 right[i].assign(alignLength, 0);
76 }
77
78 int bestMatchIndex = 0;
79 int bestSequenceMismatch = getAlignedMismatches(querySeq, left, right, bestMatchIndex);
80
81 int leftParentBi, rightParentBi, breakPointBi;
82 int minMismatchToChimera = getChimera(left, right, leftParentBi, rightParentBi, breakPointBi, singleLeft, bestLeft, singleRight, bestRight);
83
84 int nMera = 0;
85 string chimeraRefSeq = "";
86
87 if(bestSequenceMismatch - minMismatchToChimera >= 3){// || (minMismatchToChimera == 0 && bestSequenceMismatch != 0)){
88 nMera = 2;
89 chimeraRefSeq = stitchBimera(leftParentBi, rightParentBi, breakPointBi);
90 }
91 else{
92 nMera = 1;
93 chimeraRefSeq = referenceSeqs[bestMatchIndex];
94 }
95
96 bestRefAlignment = chimeraRefSeq;
97 bestQueryAlignment = querySeq;
98
99 double distToChimera = calcDistToChimera(bestQueryAlignment, bestRefAlignment);
100
101 output = queryName + "\t" + referenceNames[bestMatchIndex] + "\t" + toString(bestSequenceMismatch) + "\t";
102 output += referenceNames[leftParentBi] + ',' + referenceNames[rightParentBi] + "\t" + toString(breakPointBi) + "\t";
103 output += toString(minMismatchToChimera) + "\t";
104 output += toString(distToChimera) + "\t" + toString(nMera) +"\n";
105
106 bestMatch = bestMatchIndex;
107
108 return nMera;
109 }
110
111 //***************************************************************************************************************
112
analyzeUnalignedQuery(string queryName,string querySeq,string & output)113 int RefChimeraTest::analyzeUnalignedQuery(string queryName, string querySeq, string& output){
114
115 int nMera = 0;
116
117 int seqLength = querySeq.length();
118
119 vector<string> queryAlign; queryAlign.resize(numRefSeqs);
120 vector<string> refAlign; refAlign.resize(numRefSeqs);
121
122 vector<vector<int> > leftDiffs; leftDiffs.resize(numRefSeqs);
123 vector<vector<int> > rightDiffs; rightDiffs.resize(numRefSeqs);
124 vector<vector<int> > leftMaps; leftMaps.resize(numRefSeqs);
125 vector<vector<int> > rightMaps; rightMaps.resize(numRefSeqs);
126
127 int bestRefIndex = -1;
128 int bestRefDiffs = numeric_limits<int>::max();
129 double bestRefLength = 0;
130
131 // if (queryName == "OTU_1008") {
132 // cout << queryName << endl << querySeq << endl << endl;
133 // }
134 for(int i=0;i<numRefSeqs;i++){
135 double length = 0;
136 double diffs = alignQueryToReferences(querySeq, referenceSeqs[i], queryAlign[i], refAlign[i], length);
137 if(diffs < bestRefDiffs){
138 bestRefDiffs = diffs;
139 bestRefLength = length;
140 //if (queryName == "OTU_1008") {
141 // cout << queryName << endl << queryAlign[i] << endl << endl << refAlign[i] << endl;
142 //}
143 bestRefIndex = i;
144 }
145 }
146
147
148 if(bestRefDiffs >= 3){
149 for(int i=0;i<numRefSeqs;i++){
150 leftDiffs[i].assign(seqLength, 0);
151 rightDiffs[i].assign(seqLength, 0);
152 leftMaps[i].assign(seqLength, 0);
153 rightMaps[i].assign(seqLength, 0);
154
155 getUnalignedDiffs(queryAlign[i], refAlign[i], leftDiffs[i], leftMaps[i], rightDiffs[i], rightMaps[i]);
156 }
157
158 vector<int> singleLeft(seqLength, numeric_limits<int>::max());
159 vector<int> bestLeft(seqLength, -1);
160
161 for(int l=0;l<seqLength;l++){
162
163 for(int i=0;i<numRefSeqs;i++){
164 if(leftDiffs[i][l] < singleLeft[l]){
165 singleLeft[l] = leftDiffs[i][l];
166 bestLeft[l] = i;
167 }
168 }
169 }
170
171 vector<int> singleRight(seqLength, numeric_limits<int>::max());
172 vector<int> bestRight(seqLength, -1);
173
174 for(int l=0;l<seqLength;l++){
175
176 for(int i=0;i<numRefSeqs;i++){
177 if(rightDiffs[i][l] < singleRight[l]){
178 singleRight[l] = rightDiffs[i][l];
179 bestRight[l] = i;
180 }
181 }
182 }
183
184 int bestChimeraMismatches = numeric_limits<int>::max();
185 int leftParent = 0;
186 int rightParent = 0;
187 int breakPoint = 0;
188
189 for(int l=0;l<seqLength-1;l++){
190
191 int chimera = singleLeft[l] + singleRight[seqLength - l - 2];
192 if(chimera < bestChimeraMismatches){
193 bestChimeraMismatches = chimera;
194 breakPoint = l;
195 leftParent = bestLeft[l];
196 rightParent = bestRight[seqLength - l - 2];
197 }
198 }
199
200 string reference;
201
202 if(bestRefDiffs - bestChimeraMismatches >= 3){// || (minMismatchToChimera == 0 && bestSequenceMismatch != 0)){
203 nMera = 2;
204
205 int breakLeft = leftMaps[leftParent][breakPoint];
206 int breakRight = rightMaps[rightParent][rightMaps[rightParent].size() - breakPoint - 2];
207
208 string left = refAlign[leftParent];
209 string right = refAlign[rightParent];
210
211 for(int i=0;i<=breakLeft;i++){
212
213 if (m->getControl_pressed()) { return 0; }
214
215 if(left[i] != '-' && left[i] != '.'){
216 reference += left[i];
217 }
218 }
219
220
221 for(int i=breakRight;i<right.length();i++){
222
223 if (m->getControl_pressed()) { return 0; }
224
225 if(right[i] != '-' && right[i] != '.'){
226 reference += right[i];
227 }
228 }
229
230 }
231 else{
232 nMera = 1;
233 reference = referenceSeqs[bestRefIndex];
234 }
235
236 double alignLength = 0.0;
237 double finalDiffs = alignQueryToReferences(querySeq, reference, bestQueryAlignment, bestRefAlignment, alignLength);
238 double finalDistance = finalDiffs / alignLength;
239
240 output = queryName + "\t" + referenceNames[bestRefIndex] + "\t" + toString(bestRefDiffs) + "\t";
241 output += referenceNames[leftParent] + ',' + referenceNames[rightParent] + "\t" + toString(breakPoint) + "\t";
242 output += toString(bestChimeraMismatches) + "\t";
243 output += toString(finalDistance) + "\t" + toString(nMera) +"\n";
244 }
245 else{
246 bestQueryAlignment = queryAlign[bestRefIndex];
247 bestRefAlignment = refAlign[bestRefIndex];
248 nMera = 1;
249
250 output = queryName + "\t" + referenceNames[bestRefIndex] + "\t" + toString(bestRefDiffs) + "\tNA\tNA\tNA\tNA\t1\n";
251 }
252
253 bestMatch = bestRefIndex;
254 return nMera;
255 }
256
257 /**************************************************************************************************/
258
alignQueryToReferences(string query,string reference,string & qAlign,string & rAlign,double & length)259 double RefChimeraTest::alignQueryToReferences(string query, string reference, string& qAlign, string& rAlign, double& length){
260
261
262 try {
263 double GAP = -5;
264 double MATCH = 1;
265 double MISMATCH = -1;
266
267 int queryLength = query.length();
268 int refLength = reference.length();
269
270 vector<vector<double> > alignMatrix; alignMatrix.resize(queryLength + 1);
271 vector<vector<char> > alignMoves; alignMoves.resize(queryLength + 1);
272
273 for(int i=0;i<=queryLength;i++){
274 if (m->getControl_pressed()) { return 0; }
275 alignMatrix[i].resize(refLength + 1, 0);
276 alignMoves[i].resize(refLength + 1, 'x');
277 }
278
279 for(int i=0;i<=queryLength;i++){
280 if (m->getControl_pressed()) { return 0; }
281 alignMatrix[i][0] = 0;//GAP * i;
282 alignMoves[i][0] = 'u';
283 }
284
285 for(int i=0;i<=refLength;i++){
286 if (m->getControl_pressed()) { return 0; }
287 alignMatrix[0][i] = 0;//GAP * i;
288 alignMoves[0][i] = 'l';
289 }
290
291 for(int i=1;i<=queryLength;i++){
292
293 if (m->getControl_pressed()) { return 0; }
294
295 for(int j=1;j<=refLength;j++){
296
297 double nogapScore;
298 if(query[i-1] == reference[j-1]){ nogapScore = alignMatrix[i-1][j-1] + MATCH; }
299 else { nogapScore = alignMatrix[i-1][j-1] + MISMATCH; }
300
301 double leftScore;
302 if(i == queryLength) { leftScore = alignMatrix[i][j-1]; }
303 else { leftScore = alignMatrix[i][j-1] + GAP; }
304
305
306 double upScore;
307 if(j == refLength) { upScore = alignMatrix[i-1][j]; }
308 else { upScore = alignMatrix[i-1][j] + GAP; }
309
310 if(nogapScore > leftScore){
311 if(nogapScore > upScore){
312 alignMoves[i][j] = 'd';
313 alignMatrix[i][j] = nogapScore;
314 }
315 else{
316 alignMoves[i][j] = 'u';
317 alignMatrix[i][j] = upScore;
318 }
319 }
320 else{
321 if(leftScore > upScore){
322 alignMoves[i][j] = 'l';
323 alignMatrix[i][j] = leftScore;
324 }
325 else{
326 alignMoves[i][j] = 'u';
327 alignMatrix[i][j] = upScore;
328 }
329 }
330 }
331 }
332
333 int end = refLength - 1;
334 int maxRow = 0;
335 double maxRowValue = -2147483647;
336 for(int i=0;i<queryLength;i++){
337 if(alignMatrix[i][end] > maxRowValue){
338 maxRow = i;
339 maxRowValue = alignMatrix[i][end];
340 }
341 }
342
343 end = queryLength - 1;
344 int maxColumn = 0;
345 double maxColumnValue = -2147483647;
346
347 for(int j=0;j<refLength;j++){
348 if(alignMatrix[end][j] > maxColumnValue){
349 maxColumn = j;
350 maxColumnValue = alignMatrix[end][j];
351 }
352 }
353
354 int row = queryLength-1;
355 int column = refLength-1;
356
357 if(maxColumn == column && maxRow == row){} // if the max values are the lower right corner, then we're good
358 else if(alignMatrix[row][maxColumn] < alignMatrix[maxRow][column]){
359 for(int i=maxRow+1;i<queryLength;i++){ // decide whether sequence A or B needs the gaps at the end either set
360 alignMoves[i][column] = 'u';// the pointer upwards or...
361 }
362
363 }
364 else {
365 for(int i=maxColumn+1;i<refLength;i++){
366 alignMoves[row][i] = 'l'; // ...to the left
367 }
368 }
369
370 int i = queryLength;
371 int j = refLength;
372
373
374 qAlign = "";
375 rAlign = "";
376
377 int diffs = 0;
378 length = 0;
379
380 while(i > 0 && j > 0){
381
382 if (m->getControl_pressed()) { return 0; }
383
384 if(alignMoves[i][j] == 'd'){
385 qAlign = query[i-1] + qAlign;
386 rAlign = reference[j-1] + rAlign;
387
388 if(query[i-1] != reference[j-1]){ diffs++; }
389 length++;
390
391 i--;
392 j--;
393 }
394 else if(alignMoves[i][j] == 'u'){
395 qAlign = query[i-1] + qAlign;
396
397 if(j != refLength) { rAlign = '-' + rAlign; diffs++; length++; }
398 else { rAlign = '.' + rAlign; }
399 i--;
400 }
401 else if(alignMoves[i][j] == 'l'){
402 rAlign = reference[j-1] + rAlign;
403
404 if(i != queryLength){ qAlign = '-' + qAlign; diffs++; length++; }
405 else { qAlign = '.' + qAlign; }
406 j--;
407 }
408 }
409
410 if(i>0){
411 qAlign = query.substr(0, i) + qAlign;
412 rAlign = string(i, '.') + rAlign;
413 }
414 else if(j>0){
415 qAlign = string(j, '.') + qAlign;
416 rAlign = reference.substr(0, j) + rAlign;
417 }
418
419 if (length == 0) { diffs = MAXINT; }
420
421 return diffs;
422 }
423 catch(exception& e) {
424 m->errorOut(e, "RefChimeraTest", "alignQueryToReferences");
425 exit(1);
426 }
427 }
428
429 /**************************************************************************************************/
430
getUnalignedDiffs(string qAlign,string rAlign,vector<int> & leftDiffs,vector<int> & leftMap,vector<int> & rightDiffs,vector<int> & rightMap)431 int RefChimeraTest::getUnalignedDiffs(string qAlign, string rAlign, vector<int>& leftDiffs, vector<int>& leftMap, vector<int>& rightDiffs, vector<int>& rightMap){
432 try {
433 int alignLength = qAlign.length();
434
435 int lDiffs = 0;
436 int lCount = 0;
437 for(int l=0;l<alignLength;l++){
438
439 if (m->getControl_pressed()) { return 0; }
440
441 if(qAlign[l] == '-'){
442 lDiffs++;
443 }
444 else if(qAlign[l] != '.'){
445
446 if(rAlign[l] == '-'){
447 lDiffs++;
448 }
449 else if(qAlign[l] != rAlign[l]){;// && rAlign[l] != '.'){
450 lDiffs++;
451 }
452 leftDiffs[lCount] = lDiffs;
453 leftMap[lCount] = l;
454
455 lCount++;
456 }
457 }
458
459 int rDiffs = 0;
460 int rCount = 0;
461 for(int l=alignLength-1;l>=0;l--){
462
463 if (m->getControl_pressed()) { return 0; }
464
465 if(qAlign[l] == '-'){
466 rDiffs++;
467 }
468 else if(qAlign[l] != '.'){
469
470 if(rAlign[l] == '-'){
471 rDiffs++;
472 }
473 else if(qAlign[l] != rAlign[l]){;// && rAlign[l] != '.'){
474 rDiffs++;
475 }
476
477 rightDiffs[rCount] = rDiffs;
478 rightMap[rCount] = l;
479 rCount++;
480 }
481
482 }
483
484 return 0;
485 }
486 catch(exception& e) {
487 m->errorOut(e, "RefChimeraTest", "getUnalignedDiffs");
488 exit(1);
489 }
490 }
491
492 /**************************************************************************************************/
493
getAlignedMismatches(string & querySeq,vector<vector<int>> & left,vector<vector<int>> & right,int & bestRefSeq)494 int RefChimeraTest::getAlignedMismatches(string& querySeq, vector<vector<int> >& left, vector<vector<int> >& right, int& bestRefSeq){
495
496 int bestSequenceMismatch = MAXINT;
497
498 for(int i=0;i<numRefSeqs;i++){
499
500 int lDiffs = 0;
501
502 for(int l=0;l<alignLength;l++){
503 if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l] && referenceSeqs[i][l] != 'N'){
504 lDiffs++;
505 }
506 left[i][l] = lDiffs;
507 }
508
509 int rDiffs = 0;
510 int index = 0;
511 for(int l=alignLength-1;l>=0;l--){
512 if(querySeq[l] != '.' && referenceSeqs[i][l] != '.' && querySeq[l] != referenceSeqs[i][l] && referenceSeqs[i][l] != 'N'){
513 rDiffs++;
514 }
515 right[i][index++] = rDiffs;
516 }
517
518 if(lDiffs < bestSequenceMismatch){
519 bestSequenceMismatch = lDiffs;
520 bestRefSeq = i;
521 }
522 }
523 return bestSequenceMismatch;
524 }
525
526 /**************************************************************************************************/
527
getChimera(vector<vector<int>> & left,vector<vector<int>> & right,int & leftParent,int & rightParent,int & breakPoint,vector<int> & singleLeft,vector<int> & bestLeft,vector<int> & singleRight,vector<int> & bestRight)528 int RefChimeraTest::getChimera(vector<vector<int> >& left, vector<vector<int> >& right, int& leftParent, int& rightParent, int& breakPoint, vector<int>& singleLeft, vector<int>& bestLeft, vector<int>& singleRight, vector<int>& bestRight){
529
530 singleLeft.resize(alignLength, MAXINT);
531 bestLeft.resize(alignLength, -1);
532
533 for(int l=0;l<alignLength;l++){
534 for(int i=0;i<numRefSeqs;i++){
535 if(left[i][l] <= singleLeft[l]){
536 singleLeft[l] = left[i][l];
537 bestLeft[l] = i;
538 }
539 }
540 }
541
542 singleRight.resize(alignLength, MAXINT);
543 bestRight.resize(alignLength, -1);
544
545 for(int l=0;l<alignLength;l++){
546 for(int i=0;i<numRefSeqs;i++){
547 if(right[i][l] <= singleRight[l]){
548 singleRight[l] = right[i][l];
549 bestRight[l] = i;
550 }
551 }
552 }
553
554 int bestChimeraMismatches = MAXINT;
555 leftParent = -1;
556 rightParent = -1;
557 breakPoint = -1;
558
559 for(int l=0;l<alignLength;l++){
560 int chimera = singleLeft[l] + singleRight[alignLength - l - 1];
561 if(chimera < bestChimeraMismatches){
562 bestChimeraMismatches = chimera;
563 breakPoint = l;
564 leftParent = bestLeft[l];
565 rightParent = bestRight[alignLength - l - 1];
566 }
567 }
568
569 return bestChimeraMismatches;
570 }
571
572 /**************************************************************************************************/
573
getTrimera(vector<vector<int>> & left,vector<vector<int>> & right,int & leftParent,int & middleParent,int & rightParent,int & breakPointA,int & breakPointB,vector<int> & singleLeft,vector<int> & bestLeft,vector<int> & singleRight,vector<int> & bestRight)574 int RefChimeraTest::getTrimera(vector<vector<int> >& left, vector<vector<int> >& right, int& leftParent, int& middleParent, int& rightParent, int& breakPointA, int& breakPointB, vector<int>& singleLeft, vector<int>& bestLeft, vector<int>& singleRight, vector<int>& bestRight){
575
576 int bestTrimeraMismatches = MAXINT;
577
578 leftParent = -1;
579 middleParent = -1;
580 rightParent = -1;
581
582 breakPointA = -1;
583 breakPointB = -1;
584
585 vector<vector<int> > minDelta; minDelta.resize(alignLength);
586 vector<vector<int> > minDeltaSeq; minDeltaSeq.resize(alignLength);
587
588 for(int i=0;i<alignLength;i++){
589 minDelta[i].assign(alignLength, MAXINT);
590 minDeltaSeq[i].assign(alignLength, -1);
591 }
592
593 for(int x=0;x<alignLength;x++){
594 for(int y=x;y<alignLength-1;y++){
595
596 for(int i=0;i<numRefSeqs;i++){
597 int delta = left[i][y] - left[i][x];
598
599 if(delta <= minDelta[x][y]){
600 minDelta[x][y] = delta;
601 minDeltaSeq[x][y] = i;
602 }
603 }
604 minDelta[x][y] += singleLeft[x] + singleRight[alignLength - y - 2];
605
606 if(minDelta[x][y] < bestTrimeraMismatches){
607 bestTrimeraMismatches = minDelta[x][y];
608
609 breakPointA = x;
610 breakPointB = y;
611
612 leftParent = bestLeft[x];
613 middleParent = minDeltaSeq[x][y];
614 rightParent = bestRight[alignLength - y - 2];
615 }
616 }
617 }
618 return bestTrimeraMismatches;
619 }
620
621 /**************************************************************************************************/
622
stitchBimera(int leftParent,int rightParent,int breakPoint)623 string RefChimeraTest::stitchBimera(int leftParent, int rightParent, int breakPoint){
624
625 string chimeraRefSeq = referenceSeqs[leftParent].substr(0, breakPoint) + referenceSeqs[rightParent].substr(breakPoint);
626 return chimeraRefSeq;
627
628 }
629
630 /**************************************************************************************************/
631
stitchTrimera(int leftParent,int middleParent,int rightParent,int breakPointA,int breakPointB)632 string RefChimeraTest::stitchTrimera(int leftParent, int middleParent, int rightParent, int breakPointA, int breakPointB){
633
634 string chimeraRefSeq = referenceSeqs[leftParent].substr(0, breakPointA) + referenceSeqs[middleParent].substr(breakPointA, breakPointB-breakPointA) + referenceSeqs[rightParent].substr(breakPointB);
635
636 return chimeraRefSeq;
637 }
638
639 /**************************************************************************************************/
640
calcDistToChimera(string & querySeq,string & chimeraRefSeq)641 double RefChimeraTest::calcDistToChimera(string& querySeq, string& chimeraRefSeq){
642
643 int match = 0;
644 int mismatch = 0;
645
646 for(int i=0;i<alignLength;i++){
647 // if(querySeq[i] != '.' && chimeraRefSeq[i] != '.'){
648 if(chimeraRefSeq[i] != '.' && querySeq[i] != '.'){
649 if(querySeq[i] == '-' && chimeraRefSeq[i] == '-' && chimeraRefSeq[i] != 'N'){ /* do nothing */ }
650 else if(querySeq[i] == chimeraRefSeq[i]){
651 match++;
652 }
653 else{
654 mismatch++;
655 }
656 }
657 }
658
659 return (double)mismatch / (double)(mismatch + match);
660 }
661
662 //***************************************************************************************************************
663
getClosestRefIndex()664 int RefChimeraTest::getClosestRefIndex(){
665
666 return bestMatch;
667
668 }
669
670 //***************************************************************************************************************
671
getQueryAlignment()672 string RefChimeraTest::getQueryAlignment(){
673
674 return bestQueryAlignment;
675
676 }
677
678 //***************************************************************************************************************
679
getClosestRefAlignment()680 string RefChimeraTest::getClosestRefAlignment(){
681
682 return bestRefAlignment;
683
684 }
685
686 //***************************************************************************************************************
687