1 //
2 //  optiblastmatrix.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 5/10/18.
6 //  Copyright © 2018 Schloss Lab. All rights reserved.
7 //
8 
9 #include "optiblastmatrix.hpp"
10 
11 /***********************************************************************/
OptiBlastMatrix(string d,string nc,string f,bool s,double c,int l,float p,bool min)12 OptiBlastMatrix::OptiBlastMatrix(string d, string nc, string f, bool s, double c, int l, float p, bool min) : OptiData(c), length(l), penalty(p), minWanted(min) {
13 
14     m = MothurOut::getInstance();
15 
16     distFile = d; format = f; sim = s;
17 
18     if (format == "name") { namefile = nc; countfile = ""; }
19     else if (format == "count") { countfile = nc; namefile = ""; }
20     else { countfile = ""; namefile = ""; }
21 
22     readBlast();
23 }
24 /***********************************************************************/
getOverlapName(long long index)25 string OptiBlastMatrix::getOverlapName(long long index) {
26     try {
27         if (index > blastOverlap.size()) { m->mothurOut("[ERROR]: index is not valid.\n"); m->setControl_pressed(true); return ""; }
28         string name = overlapNameMap[index];
29         return name;
30     }
31     catch(exception& e) {
32         m->errorOut(e, "OptiBlastMatrix", "getOverlapName");
33         exit(1);
34     }
35 }
36 /***********************************************************************/
readBlast()37 int OptiBlastMatrix::readBlast(){
38     try {
39         Utils util;
40         map<string, long long> nameAssignment;
41         if (namefile != "") { util.readNames(namefile, nameAssignment); }
42         else if (countfile != "") {
43             CountTable ct; ct.readTable(countfile, false, true);
44             map<string, int> temp = ct.getNameMap();
45             for (map<string, int>::iterator it = temp.begin(); it!= temp.end(); it++) {  nameAssignment[it->first] = it->second; }
46         }
47         else { readBlastNames(nameAssignment);  }
48         int count = 0;
49         for (map<string, long long>::iterator it = nameAssignment.begin(); it!= nameAssignment.end(); it++) {
50             it->second = count; count++;
51             nameMap.push_back(it->first);
52             overlapNameMap.push_back(it->first);
53         }
54 
55         m->mothurOut("Reading Blast File... "); cout.flush();
56 
57         string firstName, secondName, eScore, currentRow; currentRow = "";
58         string repeatName = "";
59         float distance, thisoverlap, refScore;
60         float percentId;
61         float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score, lengthThisSeq;
62         map<string, float> thisRowsBlastScores;
63 
64         ///////////////////// Read to eliminate singletons ///////////////////////
65         ifstream fileHandle;
66         util.openInputFile(distFile, fileHandle);
67 
68         map<int, int> singletonIndexSwap;
69         map<int, int> blastSingletonIndexSwap;
70         vector<bool> singleton; singleton.resize(nameAssignment.size(), true);
71         vector<bool> overlapSingleton; overlapSingleton.resize(nameAssignment.size(), true);
72         vector< map<string,float> > dists;  dists.resize(nameAssignment.size());
73 
74         if (!fileHandle.eof()) {
75             //read in line from file
76             fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
77             util.gobble(fileHandle);
78 
79             currentRow = firstName;
80             lengthThisSeq = numBases;
81             repeatName = firstName + secondName;
82 
83             if (firstName == secondName) {   refScore = score;  }
84             else{
85                 thisRowsBlastScores[secondName] = score;
86 
87                 //calc overlap score
88                 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
89 
90                 //if there is a valid overlap, add it
91                 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap <= cutoff)) {
92                     //convert name to number
93                     map<string,long long>::iterator itA = nameAssignment.find(firstName);
94                     map<string,long long>::iterator itB = nameAssignment.find(secondName);
95                     if(itA == nameAssignment.end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
96                     if(itB == nameAssignment.end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
97 
98                     int indexA = (itA->second);
99                     int indexB = (itB->second);
100                     overlapSingleton[indexA] = false;
101                     overlapSingleton[indexB] = false;
102                     blastSingletonIndexSwap[indexA] = indexA;
103                     blastSingletonIndexSwap[indexB] = indexB;
104                 }
105             }
106         }else { m->mothurOut("Error in your blast file, cannot read.\n");  exit(1); }
107 
108 
109         while(fileHandle){  //let's assume it's a triangular matrix...
110 
111             if (m->getControl_pressed()) { fileHandle.close(); return 0; }
112 
113             //read in line from file
114             fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
115             util.gobble(fileHandle);
116 
117             string temp = firstName + secondName; //to check if this file has repeat lines, ie. is this a blast instead of a blscreen file
118 
119             //if this is a new pairing
120             if (temp != repeatName) {
121                 repeatName = temp;
122 
123                 if (currentRow == firstName) {
124                     if (firstName == secondName) {  refScore = score; }
125                     else{
126                         //save score
127                         thisRowsBlastScores[secondName] = score;
128 
129                         //calc overlap score
130                         thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
131 
132                         //if there is a valid overlap, add it
133                         if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap <= cutoff)) {
134                             //convert name to number
135                             map<string,long long>::iterator itA = nameAssignment.find(firstName);
136                             map<string,long long>::iterator itB = nameAssignment.find(secondName);
137                             if(itA == nameAssignment.end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
138                             if(itB == nameAssignment.end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
139                             int indexA = (itA->second);
140                             int indexB = (itB->second);
141                             overlapSingleton[indexA] = false;
142                             overlapSingleton[indexB] = false;
143                             blastSingletonIndexSwap[indexA] = indexA;
144                             blastSingletonIndexSwap[indexB] = indexB;
145                         }
146                     } //end else
147                 }else { //end row
148                     //convert blast scores to distance and add cell to sparse matrix if we can
149                     map<string, float>::iterator it;
150                     map<string, float>::iterator itDist;
151                     for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {
152                         distance = 1.0 - (it->second / refScore);
153 
154                         //do we already have the distance calculated for b->a
155                         map<string,long long>::iterator itA = nameAssignment.find(currentRow);
156                         map<string,long long>::iterator itB = nameAssignment.find(it->first);
157                         itDist = dists[itB->second].find(itA->first);
158 
159                         //if we have it then compare
160                         if (itDist != dists[itB->second].end()) {
161 
162                             //if you want the minimum blast score ratio, then pick max distance
163                             if(minWanted) {	 distance = max(itDist->second, distance);  }
164                             else{	distance = min(itDist->second, distance);  }
165 
166                             //is this distance below cutoff
167                             if (distance <= cutoff) {
168                                 int indexA = (itA->second);
169                                 int indexB = (itB->second);
170                                 singleton[indexA] = false;
171                                 singleton[indexB] = false;
172                                 singletonIndexSwap[indexA] = indexA;
173                                 singletonIndexSwap[indexB] = indexB;
174                             }
175                             //not going to need this again
176                             dists[itB->second].erase(itDist);
177                         }else { //save this value until we get the other ratio
178                             dists[itA->second][it->first] = distance;
179                         }
180                     }
181                     //clear out last rows info
182                     thisRowsBlastScores.clear();
183 
184                     currentRow = firstName;
185                     lengthThisSeq = numBases;
186 
187                     //add this row to thisRowsBlastScores
188                     if (firstName == secondName) {   refScore = score;  }
189                     else{ //add this row to thisRowsBlastScores
190                         thisRowsBlastScores[secondName] = score;
191 
192                         //calc overlap score
193                         thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
194 
195                         //if there is a valid overlap, add it
196                         if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap <= cutoff)) {
197                             //convert name to number
198                             map<string,long long>::iterator itA = nameAssignment.find(firstName);
199                             map<string,long long>::iterator itB = nameAssignment.find(secondName);
200                             if(itA == nameAssignment.end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
201                             if(itB == nameAssignment.end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
202                             int indexA = (itA->second);
203                             int indexB = (itB->second);
204                             overlapSingleton[indexA] = false;
205                             overlapSingleton[indexB] = false;
206                             blastSingletonIndexSwap[indexA] = indexA;
207                             blastSingletonIndexSwap[indexB] = indexB;
208                         }
209                     }
210                 }//end if current row
211             }//end if repeat
212         }
213         fileHandle.close();
214 
215         //convert blast scores to distance and add cell to sparse matrix if we can
216         map<string, float>::iterator it;
217         map<string, float>::iterator itDist;
218         for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {
219             distance = 1.0 - (it->second / refScore);
220 
221             //do we already have the distance calculated for b->a
222             map<string,long long>::iterator itA = nameAssignment.find(currentRow);
223             map<string,long long>::iterator itB = nameAssignment.find(it->first);
224             itDist = dists[itB->second].find(itA->first);
225 
226             //if we have it then compare
227             if (itDist != dists[itB->second].end()) {
228 
229                 //if you want the minimum blast score ratio, then pick max distance
230                 if(minWanted) {	 distance = max(itDist->second, distance);  }
231                 else{	distance = min(itDist->second, distance);  }
232 
233                 //is this distance below cutoff
234                 if (distance <= cutoff) {
235                     int indexA = (itA->second);
236                     int indexB = (itB->second);
237                     singleton[indexA] = false;
238                     singleton[indexB] = false;
239                     singletonIndexSwap[indexA] = indexA;
240                     singletonIndexSwap[indexB] = indexB;
241                 }
242                 //not going to need this again
243                 dists[itB->second].erase(itDist);
244             }else { //save this value until we get the other ratio
245                 dists[itA->second][it->first] = distance;
246             }
247         }
248         //clear out info
249         thisRowsBlastScores.clear();
250         dists.clear();
251 
252         //////////////////////////////////////////////////////////////////////////
253         int nonSingletonCount = 0;
254         for (int i = 0; i < singleton.size(); i++) {
255             if (!singleton[i]) { //if you are a singleton
256                 singletonIndexSwap[i] = nonSingletonCount;
257                 nonSingletonCount++;
258             }else { singletons.push_back(nameMap[i]); }
259         }
260         singleton.clear();
261 
262         int overlapNonSingletonCount = 0;
263         for (int i = 0; i < overlapSingleton.size(); i++) {
264             if (!overlapSingleton[i]) { //if you are a singleton
265                 blastSingletonIndexSwap[i] = overlapNonSingletonCount;
266                 overlapNonSingletonCount++;
267             }
268         }
269         overlapSingleton.clear();
270 
271         ifstream in;
272         util.openInputFile(distFile, in);
273 
274         dists.resize(nameAssignment.size());
275         closeness.resize(nonSingletonCount);
276         blastOverlap.resize(overlapNonSingletonCount);
277 
278         map<string, string> names;
279         if (namefile != "") {
280             util.readNames(namefile, names);
281             for (int i = 0; i < singletons.size(); i++) {
282                 singletons[i] = names[singletons[i]];
283             }
284         }
285 
286         m->mothurOut(" halfway ... "); cout.flush();
287 
288         if (!in.eof()) {
289             //read in line from file
290             in >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
291             util.gobble(fileHandle);
292 
293             currentRow = firstName;
294             lengthThisSeq = numBases;
295             repeatName = firstName + secondName;
296 
297             if (firstName == secondName) {   refScore = score;  }
298             else{
299                 //convert name to number
300                 map<string,long long>::iterator itA = nameAssignment.find(firstName);
301                 map<string,long long>::iterator itB = nameAssignment.find(secondName);
302                 if(itA == nameAssignment.end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
303                 if(itB == nameAssignment.end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
304 
305                 thisRowsBlastScores[secondName] = score;
306 
307                 if (namefile != "") {
308                     firstName = names[firstName];  //redundant names
309                     secondName = names[secondName]; //redundant names
310                 }
311 
312                 nameMap[singletonIndexSwap[itA->second]] = firstName;
313                 nameMap[singletonIndexSwap[itB->second]] = secondName;
314 
315                 //calc overlap score
316                 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
317 
318                 //if there is a valid overlap, add it
319                 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap <= cutoff)) {
320                     int indexA = (itA->second);
321                     int indexB = (itB->second);
322 
323                     int newB = blastSingletonIndexSwap[indexB];
324                     int newA = blastSingletonIndexSwap[indexA];
325                     blastOverlap[newA].insert(newB);
326                     blastOverlap[newB].insert(newA);
327 
328                     overlapNameMap[newA] = firstName;
329                     overlapNameMap[newB] = secondName;
330                 }
331             }
332         }else { m->mothurOut("Error in your blast file, cannot read.\n");  exit(1); }
333 
334 
335         while(in){  //let's assume it's a triangular matrix...
336 
337             if (m->getControl_pressed()) { fileHandle.close(); return 0; }
338 
339             //read in line from file
340             in >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
341             util.gobble(fileHandle);
342 
343             string temp = firstName + secondName; //to check if this file has repeat lines, ie. is this a blast instead of a blscreen file
344 
345             //if this is a new pairing
346             if (temp != repeatName) {
347                 repeatName = temp;
348 
349                 if (currentRow == firstName) {
350                     if (firstName == secondName) {  refScore = score; }
351                     else{
352                         //convert name to number
353                         map<string,long long>::iterator itA = nameAssignment.find(firstName);
354                         map<string,long long>::iterator itB = nameAssignment.find(secondName);
355                         if(itA == nameAssignment.end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
356                         if(itB == nameAssignment.end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
357 
358                         //save score
359                         thisRowsBlastScores[secondName] = score;
360 
361                         if (namefile != "") {
362                             firstName = names[firstName];  //redundant names
363                             secondName = names[secondName]; //redundant names
364                         }
365 
366                         nameMap[singletonIndexSwap[itA->second]] = firstName;
367                         nameMap[singletonIndexSwap[itB->second]] = secondName;
368 
369                         //calc overlap score
370                         thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
371 
372                         //if there is a valid overlap, add it
373                         if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap <= cutoff)) {
374                             int indexA = (itA->second);
375                             int indexB = (itB->second);
376 
377                             int newB = blastSingletonIndexSwap[indexB];
378                             int newA = blastSingletonIndexSwap[indexA];
379                             blastOverlap[newA].insert(newB);
380                             blastOverlap[newB].insert(newA);
381 
382                             overlapNameMap[newA] = firstName;
383                             overlapNameMap[newB] = secondName;                        }
384                     } //end else
385                 }else { //end row
386                     //convert blast scores to distance and add cell to sparse matrix if we can
387                     map<string, float>::iterator it;
388                     map<string, float>::iterator itDist;
389                     for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {
390                         distance = 1.0 - (it->second / refScore);
391 
392                         //do we already have the distance calculated for b->a
393                         map<string,long long>::iterator itA = nameAssignment.find(currentRow);
394                         map<string,long long>::iterator itB = nameAssignment.find(it->first);
395                         itDist = dists[itB->second].find(itA->first);
396 
397                         //if we have it then compare
398                         if (itDist != dists[itB->second].end()) {
399 
400                             //if you want the minimum blast score ratio, then pick max distance
401                             if(minWanted) {	 distance = max(itDist->second, distance);  }
402                             else{	distance = min(itDist->second, distance);  }
403 
404                             //is this distance below cutoff
405                             if (distance <= cutoff) {
406                                 int indexA = (itA->second);
407                                 int indexB = (itB->second);
408 
409                                 int newB = singletonIndexSwap[indexB];
410                                 int newA = singletonIndexSwap[indexA];
411                                 closeness[newA].insert(newB);
412                                 closeness[newB].insert(newA);
413                             }
414                             //not going to need this again
415                             dists[itB->second].erase(itDist);
416                         }else { //save this value until we get the other ratio
417                             dists[itA->second][it->first] = distance;
418                         }
419                     }
420                     //clear out last rows info
421                     thisRowsBlastScores.clear();
422 
423                     currentRow = firstName;
424                     lengthThisSeq = numBases;
425 
426                     //add this row to thisRowsBlastScores
427                     if (firstName == secondName) {   refScore = score;  }
428                     else{ //add this row to thisRowsBlastScores
429 
430                         //convert name to number
431                         map<string,long long>::iterator itA = nameAssignment.find(firstName);
432                         map<string,long long>::iterator itB = nameAssignment.find(secondName);
433                         if(itA == nameAssignment.end()){  m->mothurOut("AError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1);  }
434                         if(itB == nameAssignment.end()){  m->mothurOut("BError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1);  }
435 
436                         thisRowsBlastScores[secondName] = score;
437 
438                         //calc overlap score
439                         thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
440 
441                         //if there is a valid overlap, add it
442                         if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap <= cutoff)) {
443                             int indexA = (itA->second);
444                             int indexB = (itB->second);
445 
446                             int newB = blastSingletonIndexSwap[indexB];
447                             int newA = blastSingletonIndexSwap[indexA];
448                             blastOverlap[newA].insert(newB);
449                             blastOverlap[newB].insert(newA);
450 
451                             overlapNameMap[newA] = firstName;
452                             overlapNameMap[newB] = secondName;
453                         }
454                     }
455                 }//end if current row
456             }//end if repeat
457         }
458         in.close();
459 
460         //convert blast scores to distance and add cell to sparse matrix if we can
461         for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {
462             distance = 1.0 - (it->second / refScore);
463 
464             //do we already have the distance calculated for b->a
465             map<string,long long>::iterator itA = nameAssignment.find(currentRow);
466             map<string,long long>::iterator itB = nameAssignment.find(it->first);
467             itDist = dists[itB->second].find(itA->first);
468 
469             //if we have it then compare
470             if (itDist != dists[itB->second].end()) {
471 
472                 //if you want the minimum blast score ratio, then pick max distance
473                 if(minWanted) {	 distance = max(itDist->second, distance);  }
474                 else{	distance = min(itDist->second, distance);  }
475 
476                 //is this distance below cutoff
477                 if (distance <= cutoff) {
478                     int indexA = (itA->second);
479                     int indexB = (itB->second);
480 
481                     int newB = singletonIndexSwap[indexB];
482                     int newA = singletonIndexSwap[indexA];
483                     closeness[newA].insert(newB);
484                     closeness[newB].insert(newA);
485                 }
486                 //not going to need this again
487                 dists[itB->second].erase(itDist);
488             }else { //save this value until we get the other ratio
489                 dists[itA->second][it->first] = distance;
490             }
491         }
492         //clear out info
493         thisRowsBlastScores.clear();
494         dists.clear();
495         nameAssignment.clear();
496 
497         m->mothurOut(" done.\n");
498 
499         return 1;
500     }
501     catch(exception& e) {
502         m->errorOut(e, "OptiBlastMatrix", "readBlast");
503         exit(1);
504     }
505 }
506 /*********************************************************************************************/
readBlastNames(map<string,long long> & nameAssignment)507 int OptiBlastMatrix::readBlastNames(map<string, long long>& nameAssignment) {
508     try {
509         m->mothurOut("Reading names... "); cout.flush();
510 
511         string name, hold, prevName;
512         int num = 0;
513 
514         ifstream in;
515         Utils util; util.openInputFile(distFile, in);
516 
517         //read first line
518         in >> prevName;
519 
520         for (int i = 0; i < 11; i++) {  in >> hold;  }
521         util.gobble(in);
522 
523         //save name in nameMap
524         nameAssignment[prevName] = num; num++;
525 
526         map<string, long long>::iterator it;
527         while (!in.eof()) {
528             if (m->getControl_pressed()) { in.close(); return 0; }
529 
530             //read line
531             in >> name;
532 
533             for (int i = 0; i < 11; i++) {  in >> hold;  }
534             util.gobble(in);
535 
536             //is this a new name?
537             if (name != prevName) {
538                 prevName = name;
539 
540                 it = nameAssignment.find(name);
541                 if (it != nameAssignment.end()) { m->mothurOut("[ERROR]: trying to exact names from blast file, and I found dups.  Are you sequence names unique? quitting.\n"); m->setControl_pressed(true); }
542 
543                 else { nameAssignment[name] = num; num++; }
544             }
545         }
546 
547         in.close();
548 
549         if (m->getControl_pressed()) { return 0; }
550 
551         m->mothurOut(toString(num) + " names read.\n");
552 
553         return 0;
554 
555     }
556     catch(exception& e) {
557         m->errorOut(e, "OptiBlastMatrix", "readBlastNames");
558         exit(1);
559     }
560 }
561 /***********************************************************************/
562 
563 
564 
565