1 /*
2  *  maligner.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/23/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9 
10 #include "maligner.h"
11 
12 /***********************************************************************/ //int num, int match, int misMatch, , string mode, Database* dataLeft, Database* dataRight
Maligner(vector<Sequence> temp,int match,int misMatch,float div,int ms,int minCov)13 Maligner::Maligner(vector<Sequence> temp, int match, int misMatch, float div, int ms, int minCov) : db(temp), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov) {
14 			//numWanted(num),  , searchMethod(mode), databaseLeft(dataLeft), databaseRight(dataRight)
15 
16 			m = MothurOut::getInstance();
17 
18 }
19 /***********************************************************************/
getResults(Sequence q,DeCalculator decalc)20 string Maligner::getResults(Sequence q, DeCalculator decalc) {
21 	try {
22 
23 		outputResults.clear();
24 
25 		//make copy so trimming doesn't destroy query from calling class - remember to deallocate
26 		query.setName(q.getName()); query.setAligned(q.getAligned());
27 
28 		string chimera;
29 
30 		//copy refSeqs so that filter does not effect original
31 		for(int i = 0; i < db.size(); i++) {
32 			Sequence newSeq(db[i].getName(), db[i].getAligned());
33 			refSeqs.push_back(newSeq);
34 		}
35 
36 		refSeqs = minCoverageFilter(refSeqs);
37 
38 		if (refSeqs.size() < 2)  {
39 			//for (int i = 0; i < refSeqs.size(); i++) {  delete refSeqs[i];	}
40 			percentIdenticalQueryChimera = 0.0;
41 			return "unknown";
42 		}
43 
44 		int chimeraPenalty = computeChimeraPenalty();
45 
46 		//fills outputResults
47 		chimera = chimeraMaligner(chimeraPenalty, decalc);
48 
49 		if (m->getControl_pressed()) { return chimera;  }
50 
51 		//free memory
52 		//delete query;
53 
54 		//for (int i = 0; i < refSeqs.size(); i++) {  delete refSeqs[i];	}
55 
56 		return chimera;
57 	}
58 	catch(exception& e) {
59 		m->errorOut(e, "Maligner", "getResults");
60 		exit(1);
61 	}
62 }
63 /***********************************************************************/
chimeraMaligner(int chimeraPenalty,DeCalculator decalc)64 string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator decalc) {
65 	try {
66 
67 		string chimera;
68 		//trims seqs to first non gap char in all seqs and last non gap char in all seqs
69 		spotMap = decalc.trimSeqs(query, refSeqs);
70 
71 		//you trimmed the whole sequence, skip
72 		if (query.getAligned() == "") { return "no"; }
73 
74 		vector<Sequence> temp = refSeqs;
75 		temp.push_back(query);
76 
77 		temp = verticalFilter(temp);
78 		query = temp[temp.size()-1];
79 		for (int i = 0; i < temp.size()-1; i++) {  refSeqs[i] = temp[i]; }
80 
81 		vector< vector<score_struct> > matrix = buildScoreMatrix(query.getAligned().length(), refSeqs.size()); //builds and initializes
82 
83 		if (m->getControl_pressed()) { return chimera;  }
84 
85 		fillScoreMatrix(matrix, refSeqs, chimeraPenalty);
86 
87 		vector<score_struct> path = extractHighestPath(matrix);
88 
89 		if (m->getControl_pressed()) { return chimera;  }
90 
91 		vector<trace_struct> trace = mapTraceRegionsToAlignment(path);
92 
93 		if (trace.size() > 1) {		chimera = "yes";	}
94 		else { chimera = "no";	return chimera; }
95 
96 		int traceStart = path[0].col;
97 		int traceEnd = path[path.size()-1].col;
98 		string queryInRange = query.getAligned();
99 		queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1));
100 
101 		string chimeraSeq = constructChimericSeq(trace, refSeqs);
102 
103 		percentIdenticalQueryChimera = computePercentID(queryInRange, chimeraSeq);
104 
105 		if (m->getControl_pressed()) { return chimera;  }
106 
107 		//save output results
108 		for (int i = 0; i < trace.size(); i++) {
109 			int regionStart = trace[i].col;
110 			int regionEnd = trace[i].oldCol;
111 			int seqIndex = trace[i].row;
112 
113 			results temp;
114 
115 			temp.parent = refSeqs[seqIndex].getName();
116 			temp.parentAligned = db[seqIndex].getAligned();
117 			temp.nastRegionStart = spotMap[regionStart];
118 			temp.nastRegionEnd = spotMap[regionEnd];
119 			temp.regionStart = unalignedMap[regionStart];
120 			temp.regionEnd = unalignedMap[regionEnd];
121 
122 			string parentInRange = refSeqs[seqIndex].getAligned();
123 			parentInRange = parentInRange.substr(traceStart, (traceEnd-traceStart+1));
124 
125 			temp.queryToParent = computePercentID(queryInRange, parentInRange);
126 			temp.divR = (percentIdenticalQueryChimera / temp.queryToParent);
127 
128 			string queryInRegion = query.getAligned();
129 			queryInRegion = queryInRegion.substr(regionStart, (regionEnd-regionStart+1));
130 
131 			string parentInRegion = refSeqs[seqIndex].getAligned();
132 			parentInRegion = parentInRegion.substr(regionStart, (regionEnd-regionStart+1));
133 
134 			temp.queryToParentLocal = computePercentID(queryInRegion, parentInRegion);
135 
136 			outputResults.push_back(temp);
137 		}
138 
139 		return chimera;
140 	}
141 	catch(exception& e) {
142 		m->errorOut(e, "Maligner", "chimeraMaligner");
143 		exit(1);
144 	}
145 }
146 /***********************************************************************/
147 //removes top matches that do not have minimum coverage with query.
minCoverageFilter(vector<Sequence> ref)148 vector<Sequence> Maligner::minCoverageFilter(vector<Sequence> ref){
149 	try {
150 		vector<Sequence> newRefs;
151 
152 		string queryAligned = query.getAligned();
153 
154 		for (int i = 0; i < ref.size(); i++) {
155 
156 			string refAligned = ref[i].getAligned();
157 
158 			int numBases = 0;
159 			int numCovered = 0;
160 
161 			//calculate coverage
162 			for (int j = 0; j < queryAligned.length(); j++) {
163 
164 				if (isalpha(queryAligned[j])) {
165 					numBases++;
166 
167 					if (isalpha(refAligned[j])) {
168 						numCovered++;
169 					}
170 				}
171 			}
172 
173 			int coverage = ((numCovered/(float)numBases)*100);
174 
175 			//if coverage above minimum
176 			if (coverage > minCoverage) {
177 				newRefs.push_back(ref[i]);
178 			}
179 		}
180 
181 		return newRefs;
182 	}
183 	catch(exception& e) {
184 		m->errorOut(e, "Maligner", "minCoverageFilter");
185 		exit(1);
186 	}
187 }
188 /***********************************************************************/
189 // a breakpoint should yield fewer mismatches than this number with respect to the best parent sequence.
computeChimeraPenalty()190 int Maligner::computeChimeraPenalty() {
191 	try {
192 
193 		int numAllowable = ((1.0 - (1.0/minDivR)) * query.getNumBases());
194 
195 		int penalty = int(numAllowable + 1) * misMatchPenalty;
196 
197 		return penalty;
198 
199 	}
200 	catch(exception& e) {
201 		m->errorOut(e, "Maligner", "computeChimeraPenalty");
202 		exit(1);
203 	}
204 }
205 /***********************************************************************/
206 //this is a vertical filter
verticalFilter(vector<Sequence> seqs)207 vector<Sequence> Maligner::verticalFilter(vector<Sequence> seqs) {
208 	try {
209 		vector<int> gaps;	gaps.resize(query.getAligned().length(), 0);
210 
211 		string filterString = (string(query.getAligned().length(), '1'));
212 
213 		//for each sequence
214 		for (int i = 0; i < seqs.size(); i++) {
215 
216 			string seqAligned = seqs[i].getAligned();
217 
218 			for (int j = 0; j < seqAligned.length(); j++) {
219 				//if this spot is a gap
220 				if ((seqAligned[j] == '-') || (seqAligned[j] == '.'))	{	gaps[j]++;	}
221 			}
222 		}
223 
224 		//zero out spot where all sequences have blanks
225 		int numColRemoved = 0;
226 		for(int i = 0; i < seqs[0].getAligned().length(); i++){
227 			if(gaps[i] == seqs.size())	{	filterString[i] = '0'; 	numColRemoved++;  }
228 		}
229 
230 		map<int, int> newMap;
231 		//for each sequence
232 		for (int i = 0; i < seqs.size(); i++) {
233 
234 			string seqAligned = seqs[i].getAligned();
235 			string newAligned = "";
236 			int count = 0;
237 
238 			for (int j = 0; j < seqAligned.length(); j++) {
239 				//if this spot is not a gap
240 				if (filterString[j] == '1') {
241 					newAligned += seqAligned[j];
242 					newMap[count] = spotMap[j];
243 					count++;
244 				}
245 			}
246 
247 			seqs[i].setAligned(newAligned);
248 		}
249 
250 		string query = seqs[seqs.size()-1].getAligned();
251 		int queryLength = query.length();
252 
253 		unalignedMap.resize(queryLength, 0);
254 
255 
256 		for(int i=1;i<queryLength;i++){
257 			if(query[i] != '.' && query[i] != '-'){
258 				unalignedMap[i] = unalignedMap[i-1] + 1;
259 			}
260 			else{
261 				unalignedMap[i] = unalignedMap[i-1];
262 			}
263 		}
264 
265 		spotMap = newMap;
266 
267 		return seqs;
268 	}
269 	catch(exception& e) {
270 		m->errorOut(e, "Maligner", "verticalFilter");
271 		exit(1);
272 	}
273 }
274 //***************************************************************************************************************
buildScoreMatrix(int cols,int rows)275 vector< vector<score_struct> > Maligner::buildScoreMatrix(int cols, int rows) {
276 	try{
277 
278 		vector< vector<score_struct> > m(rows);
279 
280 		for (int i = 0; i < rows; i++) {
281 			for (int j = 0; j < cols; j++) {
282 
283 				//initialize each cell
284 				score_struct temp;
285 				temp.prev = -1;
286 				temp.score = -9999999;
287 				temp.col = j;
288 				temp.row = i;
289 
290 				m[i].push_back(temp);
291 			}
292 		}
293 
294 		return m;
295 	}
296 	catch(exception& e) {
297 		m->errorOut(e, "Maligner", "buildScoreMatrix");
298 		exit(1);
299 	}
300 }
301 
302 //***************************************************************************************************************
303 
fillScoreMatrix(vector<vector<score_struct>> & ms,vector<Sequence> seqs,int penalty)304 void Maligner::fillScoreMatrix(vector<vector<score_struct> >& ms, vector<Sequence> seqs, int penalty) {
305 	try{
306 
307 		//get matrix dimensions
308 		int numCols = query.getAligned().length();
309 		int numRows = seqs.size();
310 
311 		//initialize first col
312 		string queryAligned = query.getAligned();
313 		for (int i = 0; i < numRows; i++) {
314 			string subjectAligned = seqs[i].getAligned();
315 
316 			//are you both gaps?
317 			if ((!isalpha(queryAligned[0])) && (!isalpha(subjectAligned[0]))) {
318 				ms[i][0].score = 0;
319 //				ms[i][0].mismatches = 0;
320 			}else if (queryAligned[0] == subjectAligned[0])  { //|| subjectAligned[0] == 'N')
321 				ms[i][0].score = matchScore;
322 //				ms[i][0].mismatches = 0;
323 			}else{
324 				ms[i][0].score = 0;
325 //				ms[i][0].mismatches = 1;
326 			}
327 		}
328 
329 		//fill rest of matrix
330 		for (int j = 1; j < numCols; j++) {  //iterate through matrix columns
331 
332 //			for (int i = 0; i < 1; i++) {  //iterate through matrix rows
333 			for (int i = 0; i < numRows; i++) {  //iterate through matrix rows
334 
335 				string subjectAligned = seqs[i].getAligned();
336 
337 				int matchMisMatchScore = 0;
338 				//are you both gaps?
339 				if ((!isalpha(queryAligned[j])) && (!isalpha(subjectAligned[j]))) {
340 					//leave the same
341 				}else if ((toupper(queryAligned[j]) == 'N') || (toupper(subjectAligned[j]) == 'N')) {
342 					//leave the same
343 				}else if (queryAligned[j] == subjectAligned[j]) {
344 					matchMisMatchScore = matchScore;
345 				}else if (queryAligned[j] != subjectAligned[j]) {
346 					matchMisMatchScore = misMatchPenalty;
347 				}
348 
349 				//compute score based on previous columns scores
350 				for (int prevIndex = 0; prevIndex < numRows; prevIndex++) { //iterate through rows
351 
352 					int sumScore = matchMisMatchScore + ms[prevIndex][j-1].score;
353 
354 					//you are not at yourself
355 					if (prevIndex != i) {   sumScore += penalty;	}
356 					if (sumScore < 0)	{	sumScore = 0;			}
357 
358 					if (sumScore > ms[i][j].score) {
359 						ms[i][j].score = sumScore;
360 						ms[i][j].prev = prevIndex;
361 					}
362 				}
363 			}
364 		}
365 
366 	}
367 	catch(exception& e) {
368 		m->errorOut(e, "Maligner", "fillScoreMatrix");
369 		exit(1);
370 	}
371 }
372 //***************************************************************************************************************
extractHighestPath(vector<vector<score_struct>> ms)373 vector<score_struct> Maligner::extractHighestPath(vector<vector<score_struct> > ms) {
374 	try {
375 
376 		//get matrix dimensions
377 		int numCols = query.getAligned().length();
378 		int numRows = ms.size();
379 
380 
381 		//find highest score scoring matrix
382 		score_struct highestStruct;
383 		int highestScore = 0;
384 
385 		for (int i = 0; i < numRows; i++) {
386 			for (int j = 0; j < numCols; j++) {
387 				if (ms[i][j].score > highestScore) {
388 					highestScore = ms[i][j].score;
389 					highestStruct = ms[i][j];
390 				}
391 			}
392 		}
393 
394 		vector<score_struct> path;
395 
396 		int rowIndex = highestStruct.row;
397 		int pos = highestStruct.col;
398 		int score = highestStruct.score;
399 
400 		while (pos >= 0 && score > 0) {
401 			score_struct temp = ms[rowIndex][pos];
402 			score = temp.score;
403 
404 			if (score > 0) {	path.push_back(temp);	}
405 
406 			rowIndex = temp.prev;
407 			pos--;
408 		}
409 
410 		reverse(path.begin(), path.end());
411 
412 		return path;
413 
414 	}
415 	catch(exception& e) {
416 		m->errorOut(e, "Maligner", "extractHighestPath");
417 		exit(1);
418 	}
419 }
420 //***************************************************************************************************************
mapTraceRegionsToAlignment(vector<score_struct> path)421 vector<trace_struct> Maligner::mapTraceRegionsToAlignment(vector<score_struct> path) {
422 	try {
423 		vector<trace_struct> trace;
424 
425 		int region_index = path[0].row;
426 		int region_start = path[0].col;
427 
428 		for (int i = 1; i < path.size(); i++) {
429 
430 			int next_region_index = path[i].row;
431 
432 			if (next_region_index != region_index) {
433 
434 				// add trace region
435 				int col_index = path[i].col;
436 				trace_struct temp;
437 				temp.col = region_start;
438 				temp.oldCol = col_index-1;
439 				temp.row = region_index;
440 
441 				trace.push_back(temp);
442 
443 				region_index = path[i].row;
444 				region_start = col_index;
445 			}
446 		}
447 
448 		// get last one
449 		trace_struct temp;
450 		temp.col = region_start;
451 		temp.oldCol = path[path.size()-1].col;
452 		temp.row = region_index;
453 		trace.push_back(temp);
454 
455 		return trace;
456 
457 	}
458 	catch(exception& e) {
459 		m->errorOut(e, "Maligner", "mapTraceRegionsToAlignment");
460 		exit(1);
461 	}
462 }
463 //***************************************************************************************************************
464 
constructChimericSeq(vector<trace_struct> trace,vector<Sequence> seqs)465 string Maligner::constructChimericSeq(vector<trace_struct> trace, vector<Sequence> seqs) {
466 	try {
467 		string chimera = "";
468 
469 		for (int i = 0; i < trace.size(); i++) {
470 			string seqAlign = seqs[trace[i].row].getAligned();
471 			seqAlign = seqAlign.substr(trace[i].col, (trace[i].oldCol-trace[i].col+1));
472 			chimera += seqAlign;
473 		}
474 
475 		return chimera;
476 	}
477 	catch(exception& e) {
478 		m->errorOut(e, "Maligner", "constructChimericSeq");
479 		exit(1);
480 	}
481 }
482 
483 //***************************************************************************************************************
484 
constructAntiChimericSeq(vector<trace_struct> trace,vector<Sequence> seqs)485 string Maligner::constructAntiChimericSeq(vector<trace_struct> trace, vector<Sequence> seqs) {
486 	try {
487 		string antiChimera = "";
488 
489 		for (int i = 0; i < trace.size(); i++) {
490 
491 			int oppositeIndex = trace.size() - i - 1;
492 
493 			string seqAlign = seqs[trace[oppositeIndex].row].getAligned();
494 			seqAlign = seqAlign.substr(trace[i].col, (trace[i].oldCol-trace[i].col+1));
495 			antiChimera += seqAlign;
496 		}
497 
498 		return antiChimera;
499 	}
500 	catch(exception& e) {
501 		m->errorOut(e, "Maligner", "constructChimericSeq");
502 		exit(1);
503 	}
504 }
505 
506 //***************************************************************************************************************
computePercentID(string queryAlign,string chimera)507 float Maligner::computePercentID(string queryAlign, string chimera) {
508 	try {
509 
510 		if (queryAlign.length() != chimera.length()) {
511 			m->mothurOut("Error, alignment strings are of different lengths: \n");
512 			m->mothurOut(toString(queryAlign.length())+ "\n");
513 			m->mothurOut(toString(chimera.length())+ "\n");
514 			return -1.0;
515 		}
516 
517 
518 		int numIdentical = 0;
519 		int countA = 0;
520 		int countB = 0;
521 		for (int i = 0; i < queryAlign.length(); i++) {
522 			if (((queryAlign[i] != 'G') && (queryAlign[i] != 'T') && (queryAlign[i] != 'A') && (queryAlign[i] != 'C')&& (queryAlign[i] != '.') && (queryAlign[i] != '-')) ||
523 				((chimera[i] != 'G') && (chimera[i] != 'T') && (chimera[i] != 'A') && (chimera[i] != 'C')&& (chimera[i] != '.') && (chimera[i] != '-'))) {}
524 			else {
525 
526 				bool charA = false; bool charB = false;
527 				if ((queryAlign[i] == 'G') || (queryAlign[i] == 'T') || (queryAlign[i] == 'A') || (queryAlign[i] == 'C')) { charA = true; }
528 				if ((chimera[i] == 'G') || (chimera[i] == 'T') || (chimera[i] == 'A') || (chimera[i] == 'C')) { charB = true; }
529 
530 
531 				if (charA || charB) {
532 
533 					if (charA) { countA++; }
534 					if (charB) { countB++; }
535 
536 					if (queryAlign[i] == chimera[i]) {
537 						numIdentical++;
538 					}
539 				}
540 
541 
542 			}
543 		}
544 
545 		float numBases = (countA + countB) /(float) 2;
546 
547 		if (numBases == 0) { return 0; }
548 
549 		float percentIdentical = (numIdentical/(float)numBases) * 100;
550 
551 		return percentIdentical;
552 
553 	}
554 	catch(exception& e) {
555 		m->errorOut(e, "Maligner", "computePercentID");
556 		exit(1);
557 	}
558 }
559 //***************************************************************************************************************
560