1 /*
2 * maligner.cpp
3 * Mothur
4 *
5 * Created by westcott on 9/23/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
7 *
8 */
9
10 #include "maligner.h"
11
12 /***********************************************************************/ //int num, int match, int misMatch, , string mode, Database* dataLeft, Database* dataRight
Maligner(vector<Sequence> temp,int match,int misMatch,float div,int ms,int minCov)13 Maligner::Maligner(vector<Sequence> temp, int match, int misMatch, float div, int ms, int minCov) : db(temp), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov) {
14 //numWanted(num), , searchMethod(mode), databaseLeft(dataLeft), databaseRight(dataRight)
15
16 m = MothurOut::getInstance();
17
18 }
19 /***********************************************************************/
getResults(Sequence q,DeCalculator decalc)20 string Maligner::getResults(Sequence q, DeCalculator decalc) {
21 try {
22
23 outputResults.clear();
24
25 //make copy so trimming doesn't destroy query from calling class - remember to deallocate
26 query.setName(q.getName()); query.setAligned(q.getAligned());
27
28 string chimera;
29
30 //copy refSeqs so that filter does not effect original
31 for(int i = 0; i < db.size(); i++) {
32 Sequence newSeq(db[i].getName(), db[i].getAligned());
33 refSeqs.push_back(newSeq);
34 }
35
36 refSeqs = minCoverageFilter(refSeqs);
37
38 if (refSeqs.size() < 2) {
39 //for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; }
40 percentIdenticalQueryChimera = 0.0;
41 return "unknown";
42 }
43
44 int chimeraPenalty = computeChimeraPenalty();
45
46 //fills outputResults
47 chimera = chimeraMaligner(chimeraPenalty, decalc);
48
49 if (m->getControl_pressed()) { return chimera; }
50
51 //free memory
52 //delete query;
53
54 //for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; }
55
56 return chimera;
57 }
58 catch(exception& e) {
59 m->errorOut(e, "Maligner", "getResults");
60 exit(1);
61 }
62 }
63 /***********************************************************************/
chimeraMaligner(int chimeraPenalty,DeCalculator decalc)64 string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator decalc) {
65 try {
66
67 string chimera;
68 //trims seqs to first non gap char in all seqs and last non gap char in all seqs
69 spotMap = decalc.trimSeqs(query, refSeqs);
70
71 //you trimmed the whole sequence, skip
72 if (query.getAligned() == "") { return "no"; }
73
74 vector<Sequence> temp = refSeqs;
75 temp.push_back(query);
76
77 temp = verticalFilter(temp);
78 query = temp[temp.size()-1];
79 for (int i = 0; i < temp.size()-1; i++) { refSeqs[i] = temp[i]; }
80
81 vector< vector<score_struct> > matrix = buildScoreMatrix(query.getAligned().length(), refSeqs.size()); //builds and initializes
82
83 if (m->getControl_pressed()) { return chimera; }
84
85 fillScoreMatrix(matrix, refSeqs, chimeraPenalty);
86
87 vector<score_struct> path = extractHighestPath(matrix);
88
89 if (m->getControl_pressed()) { return chimera; }
90
91 vector<trace_struct> trace = mapTraceRegionsToAlignment(path);
92
93 if (trace.size() > 1) { chimera = "yes"; }
94 else { chimera = "no"; return chimera; }
95
96 int traceStart = path[0].col;
97 int traceEnd = path[path.size()-1].col;
98 string queryInRange = query.getAligned();
99 queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1));
100
101 string chimeraSeq = constructChimericSeq(trace, refSeqs);
102
103 percentIdenticalQueryChimera = computePercentID(queryInRange, chimeraSeq);
104
105 if (m->getControl_pressed()) { return chimera; }
106
107 //save output results
108 for (int i = 0; i < trace.size(); i++) {
109 int regionStart = trace[i].col;
110 int regionEnd = trace[i].oldCol;
111 int seqIndex = trace[i].row;
112
113 results temp;
114
115 temp.parent = refSeqs[seqIndex].getName();
116 temp.parentAligned = db[seqIndex].getAligned();
117 temp.nastRegionStart = spotMap[regionStart];
118 temp.nastRegionEnd = spotMap[regionEnd];
119 temp.regionStart = unalignedMap[regionStart];
120 temp.regionEnd = unalignedMap[regionEnd];
121
122 string parentInRange = refSeqs[seqIndex].getAligned();
123 parentInRange = parentInRange.substr(traceStart, (traceEnd-traceStart+1));
124
125 temp.queryToParent = computePercentID(queryInRange, parentInRange);
126 temp.divR = (percentIdenticalQueryChimera / temp.queryToParent);
127
128 string queryInRegion = query.getAligned();
129 queryInRegion = queryInRegion.substr(regionStart, (regionEnd-regionStart+1));
130
131 string parentInRegion = refSeqs[seqIndex].getAligned();
132 parentInRegion = parentInRegion.substr(regionStart, (regionEnd-regionStart+1));
133
134 temp.queryToParentLocal = computePercentID(queryInRegion, parentInRegion);
135
136 outputResults.push_back(temp);
137 }
138
139 return chimera;
140 }
141 catch(exception& e) {
142 m->errorOut(e, "Maligner", "chimeraMaligner");
143 exit(1);
144 }
145 }
146 /***********************************************************************/
147 //removes top matches that do not have minimum coverage with query.
minCoverageFilter(vector<Sequence> ref)148 vector<Sequence> Maligner::minCoverageFilter(vector<Sequence> ref){
149 try {
150 vector<Sequence> newRefs;
151
152 string queryAligned = query.getAligned();
153
154 for (int i = 0; i < ref.size(); i++) {
155
156 string refAligned = ref[i].getAligned();
157
158 int numBases = 0;
159 int numCovered = 0;
160
161 //calculate coverage
162 for (int j = 0; j < queryAligned.length(); j++) {
163
164 if (isalpha(queryAligned[j])) {
165 numBases++;
166
167 if (isalpha(refAligned[j])) {
168 numCovered++;
169 }
170 }
171 }
172
173 int coverage = ((numCovered/(float)numBases)*100);
174
175 //if coverage above minimum
176 if (coverage > minCoverage) {
177 newRefs.push_back(ref[i]);
178 }
179 }
180
181 return newRefs;
182 }
183 catch(exception& e) {
184 m->errorOut(e, "Maligner", "minCoverageFilter");
185 exit(1);
186 }
187 }
188 /***********************************************************************/
189 // a breakpoint should yield fewer mismatches than this number with respect to the best parent sequence.
computeChimeraPenalty()190 int Maligner::computeChimeraPenalty() {
191 try {
192
193 int numAllowable = ((1.0 - (1.0/minDivR)) * query.getNumBases());
194
195 int penalty = int(numAllowable + 1) * misMatchPenalty;
196
197 return penalty;
198
199 }
200 catch(exception& e) {
201 m->errorOut(e, "Maligner", "computeChimeraPenalty");
202 exit(1);
203 }
204 }
205 /***********************************************************************/
206 //this is a vertical filter
verticalFilter(vector<Sequence> seqs)207 vector<Sequence> Maligner::verticalFilter(vector<Sequence> seqs) {
208 try {
209 vector<int> gaps; gaps.resize(query.getAligned().length(), 0);
210
211 string filterString = (string(query.getAligned().length(), '1'));
212
213 //for each sequence
214 for (int i = 0; i < seqs.size(); i++) {
215
216 string seqAligned = seqs[i].getAligned();
217
218 for (int j = 0; j < seqAligned.length(); j++) {
219 //if this spot is a gap
220 if ((seqAligned[j] == '-') || (seqAligned[j] == '.')) { gaps[j]++; }
221 }
222 }
223
224 //zero out spot where all sequences have blanks
225 int numColRemoved = 0;
226 for(int i = 0; i < seqs[0].getAligned().length(); i++){
227 if(gaps[i] == seqs.size()) { filterString[i] = '0'; numColRemoved++; }
228 }
229
230 map<int, int> newMap;
231 //for each sequence
232 for (int i = 0; i < seqs.size(); i++) {
233
234 string seqAligned = seqs[i].getAligned();
235 string newAligned = "";
236 int count = 0;
237
238 for (int j = 0; j < seqAligned.length(); j++) {
239 //if this spot is not a gap
240 if (filterString[j] == '1') {
241 newAligned += seqAligned[j];
242 newMap[count] = spotMap[j];
243 count++;
244 }
245 }
246
247 seqs[i].setAligned(newAligned);
248 }
249
250 string query = seqs[seqs.size()-1].getAligned();
251 int queryLength = query.length();
252
253 unalignedMap.resize(queryLength, 0);
254
255
256 for(int i=1;i<queryLength;i++){
257 if(query[i] != '.' && query[i] != '-'){
258 unalignedMap[i] = unalignedMap[i-1] + 1;
259 }
260 else{
261 unalignedMap[i] = unalignedMap[i-1];
262 }
263 }
264
265 spotMap = newMap;
266
267 return seqs;
268 }
269 catch(exception& e) {
270 m->errorOut(e, "Maligner", "verticalFilter");
271 exit(1);
272 }
273 }
274 //***************************************************************************************************************
buildScoreMatrix(int cols,int rows)275 vector< vector<score_struct> > Maligner::buildScoreMatrix(int cols, int rows) {
276 try{
277
278 vector< vector<score_struct> > m(rows);
279
280 for (int i = 0; i < rows; i++) {
281 for (int j = 0; j < cols; j++) {
282
283 //initialize each cell
284 score_struct temp;
285 temp.prev = -1;
286 temp.score = -9999999;
287 temp.col = j;
288 temp.row = i;
289
290 m[i].push_back(temp);
291 }
292 }
293
294 return m;
295 }
296 catch(exception& e) {
297 m->errorOut(e, "Maligner", "buildScoreMatrix");
298 exit(1);
299 }
300 }
301
302 //***************************************************************************************************************
303
fillScoreMatrix(vector<vector<score_struct>> & ms,vector<Sequence> seqs,int penalty)304 void Maligner::fillScoreMatrix(vector<vector<score_struct> >& ms, vector<Sequence> seqs, int penalty) {
305 try{
306
307 //get matrix dimensions
308 int numCols = query.getAligned().length();
309 int numRows = seqs.size();
310
311 //initialize first col
312 string queryAligned = query.getAligned();
313 for (int i = 0; i < numRows; i++) {
314 string subjectAligned = seqs[i].getAligned();
315
316 //are you both gaps?
317 if ((!isalpha(queryAligned[0])) && (!isalpha(subjectAligned[0]))) {
318 ms[i][0].score = 0;
319 // ms[i][0].mismatches = 0;
320 }else if (queryAligned[0] == subjectAligned[0]) { //|| subjectAligned[0] == 'N')
321 ms[i][0].score = matchScore;
322 // ms[i][0].mismatches = 0;
323 }else{
324 ms[i][0].score = 0;
325 // ms[i][0].mismatches = 1;
326 }
327 }
328
329 //fill rest of matrix
330 for (int j = 1; j < numCols; j++) { //iterate through matrix columns
331
332 // for (int i = 0; i < 1; i++) { //iterate through matrix rows
333 for (int i = 0; i < numRows; i++) { //iterate through matrix rows
334
335 string subjectAligned = seqs[i].getAligned();
336
337 int matchMisMatchScore = 0;
338 //are you both gaps?
339 if ((!isalpha(queryAligned[j])) && (!isalpha(subjectAligned[j]))) {
340 //leave the same
341 }else if ((toupper(queryAligned[j]) == 'N') || (toupper(subjectAligned[j]) == 'N')) {
342 //leave the same
343 }else if (queryAligned[j] == subjectAligned[j]) {
344 matchMisMatchScore = matchScore;
345 }else if (queryAligned[j] != subjectAligned[j]) {
346 matchMisMatchScore = misMatchPenalty;
347 }
348
349 //compute score based on previous columns scores
350 for (int prevIndex = 0; prevIndex < numRows; prevIndex++) { //iterate through rows
351
352 int sumScore = matchMisMatchScore + ms[prevIndex][j-1].score;
353
354 //you are not at yourself
355 if (prevIndex != i) { sumScore += penalty; }
356 if (sumScore < 0) { sumScore = 0; }
357
358 if (sumScore > ms[i][j].score) {
359 ms[i][j].score = sumScore;
360 ms[i][j].prev = prevIndex;
361 }
362 }
363 }
364 }
365
366 }
367 catch(exception& e) {
368 m->errorOut(e, "Maligner", "fillScoreMatrix");
369 exit(1);
370 }
371 }
372 //***************************************************************************************************************
extractHighestPath(vector<vector<score_struct>> ms)373 vector<score_struct> Maligner::extractHighestPath(vector<vector<score_struct> > ms) {
374 try {
375
376 //get matrix dimensions
377 int numCols = query.getAligned().length();
378 int numRows = ms.size();
379
380
381 //find highest score scoring matrix
382 score_struct highestStruct;
383 int highestScore = 0;
384
385 for (int i = 0; i < numRows; i++) {
386 for (int j = 0; j < numCols; j++) {
387 if (ms[i][j].score > highestScore) {
388 highestScore = ms[i][j].score;
389 highestStruct = ms[i][j];
390 }
391 }
392 }
393
394 vector<score_struct> path;
395
396 int rowIndex = highestStruct.row;
397 int pos = highestStruct.col;
398 int score = highestStruct.score;
399
400 while (pos >= 0 && score > 0) {
401 score_struct temp = ms[rowIndex][pos];
402 score = temp.score;
403
404 if (score > 0) { path.push_back(temp); }
405
406 rowIndex = temp.prev;
407 pos--;
408 }
409
410 reverse(path.begin(), path.end());
411
412 return path;
413
414 }
415 catch(exception& e) {
416 m->errorOut(e, "Maligner", "extractHighestPath");
417 exit(1);
418 }
419 }
420 //***************************************************************************************************************
mapTraceRegionsToAlignment(vector<score_struct> path)421 vector<trace_struct> Maligner::mapTraceRegionsToAlignment(vector<score_struct> path) {
422 try {
423 vector<trace_struct> trace;
424
425 int region_index = path[0].row;
426 int region_start = path[0].col;
427
428 for (int i = 1; i < path.size(); i++) {
429
430 int next_region_index = path[i].row;
431
432 if (next_region_index != region_index) {
433
434 // add trace region
435 int col_index = path[i].col;
436 trace_struct temp;
437 temp.col = region_start;
438 temp.oldCol = col_index-1;
439 temp.row = region_index;
440
441 trace.push_back(temp);
442
443 region_index = path[i].row;
444 region_start = col_index;
445 }
446 }
447
448 // get last one
449 trace_struct temp;
450 temp.col = region_start;
451 temp.oldCol = path[path.size()-1].col;
452 temp.row = region_index;
453 trace.push_back(temp);
454
455 return trace;
456
457 }
458 catch(exception& e) {
459 m->errorOut(e, "Maligner", "mapTraceRegionsToAlignment");
460 exit(1);
461 }
462 }
463 //***************************************************************************************************************
464
constructChimericSeq(vector<trace_struct> trace,vector<Sequence> seqs)465 string Maligner::constructChimericSeq(vector<trace_struct> trace, vector<Sequence> seqs) {
466 try {
467 string chimera = "";
468
469 for (int i = 0; i < trace.size(); i++) {
470 string seqAlign = seqs[trace[i].row].getAligned();
471 seqAlign = seqAlign.substr(trace[i].col, (trace[i].oldCol-trace[i].col+1));
472 chimera += seqAlign;
473 }
474
475 return chimera;
476 }
477 catch(exception& e) {
478 m->errorOut(e, "Maligner", "constructChimericSeq");
479 exit(1);
480 }
481 }
482
483 //***************************************************************************************************************
484
constructAntiChimericSeq(vector<trace_struct> trace,vector<Sequence> seqs)485 string Maligner::constructAntiChimericSeq(vector<trace_struct> trace, vector<Sequence> seqs) {
486 try {
487 string antiChimera = "";
488
489 for (int i = 0; i < trace.size(); i++) {
490
491 int oppositeIndex = trace.size() - i - 1;
492
493 string seqAlign = seqs[trace[oppositeIndex].row].getAligned();
494 seqAlign = seqAlign.substr(trace[i].col, (trace[i].oldCol-trace[i].col+1));
495 antiChimera += seqAlign;
496 }
497
498 return antiChimera;
499 }
500 catch(exception& e) {
501 m->errorOut(e, "Maligner", "constructChimericSeq");
502 exit(1);
503 }
504 }
505
506 //***************************************************************************************************************
computePercentID(string queryAlign,string chimera)507 float Maligner::computePercentID(string queryAlign, string chimera) {
508 try {
509
510 if (queryAlign.length() != chimera.length()) {
511 m->mothurOut("Error, alignment strings are of different lengths: \n");
512 m->mothurOut(toString(queryAlign.length())+ "\n");
513 m->mothurOut(toString(chimera.length())+ "\n");
514 return -1.0;
515 }
516
517
518 int numIdentical = 0;
519 int countA = 0;
520 int countB = 0;
521 for (int i = 0; i < queryAlign.length(); i++) {
522 if (((queryAlign[i] != 'G') && (queryAlign[i] != 'T') && (queryAlign[i] != 'A') && (queryAlign[i] != 'C')&& (queryAlign[i] != '.') && (queryAlign[i] != '-')) ||
523 ((chimera[i] != 'G') && (chimera[i] != 'T') && (chimera[i] != 'A') && (chimera[i] != 'C')&& (chimera[i] != '.') && (chimera[i] != '-'))) {}
524 else {
525
526 bool charA = false; bool charB = false;
527 if ((queryAlign[i] == 'G') || (queryAlign[i] == 'T') || (queryAlign[i] == 'A') || (queryAlign[i] == 'C')) { charA = true; }
528 if ((chimera[i] == 'G') || (chimera[i] == 'T') || (chimera[i] == 'A') || (chimera[i] == 'C')) { charB = true; }
529
530
531 if (charA || charB) {
532
533 if (charA) { countA++; }
534 if (charB) { countB++; }
535
536 if (queryAlign[i] == chimera[i]) {
537 numIdentical++;
538 }
539 }
540
541
542 }
543 }
544
545 float numBases = (countA + countB) /(float) 2;
546
547 if (numBases == 0) { return 0; }
548
549 float percentIdentical = (numIdentical/(float)numBases) * 100;
550
551 return percentIdentical;
552
553 }
554 catch(exception& e) {
555 m->errorOut(e, "Maligner", "computePercentID");
556 exit(1);
557 }
558 }
559 //***************************************************************************************************************
560