1 //
2 // optiblastmatrix.cpp
3 // Mothur
4 //
5 // Created by Sarah Westcott on 5/10/18.
6 // Copyright © 2018 Schloss Lab. All rights reserved.
7 //
8
9 #include "optiblastmatrix.hpp"
10
11 /***********************************************************************/
OptiBlastMatrix(string d,string nc,string f,bool s,double c,int l,float p,bool min)12 OptiBlastMatrix::OptiBlastMatrix(string d, string nc, string f, bool s, double c, int l, float p, bool min) : OptiData(c), length(l), penalty(p), minWanted(min) {
13
14 m = MothurOut::getInstance();
15
16 distFile = d; format = f; sim = s;
17
18 if (format == "name") { namefile = nc; countfile = ""; }
19 else if (format == "count") { countfile = nc; namefile = ""; }
20 else { countfile = ""; namefile = ""; }
21
22 readBlast();
23 }
24 /***********************************************************************/
getOverlapName(long long index)25 string OptiBlastMatrix::getOverlapName(long long index) {
26 try {
27 if (index > blastOverlap.size()) { m->mothurOut("[ERROR]: index is not valid.\n"); m->setControl_pressed(true); return ""; }
28 string name = overlapNameMap[index];
29 return name;
30 }
31 catch(exception& e) {
32 m->errorOut(e, "OptiBlastMatrix", "getOverlapName");
33 exit(1);
34 }
35 }
36 /***********************************************************************/
readBlast()37 int OptiBlastMatrix::readBlast(){
38 try {
39 Utils util;
40 map<string, long long> nameAssignment;
41 if (namefile != "") { util.readNames(namefile, nameAssignment); }
42 else if (countfile != "") {
43 CountTable ct; ct.readTable(countfile, false, true);
44 map<string, int> temp = ct.getNameMap();
45 for (map<string, int>::iterator it = temp.begin(); it!= temp.end(); it++) { nameAssignment[it->first] = it->second; }
46 }
47 else { readBlastNames(nameAssignment); }
48 int count = 0;
49 for (map<string, long long>::iterator it = nameAssignment.begin(); it!= nameAssignment.end(); it++) {
50 it->second = count; count++;
51 nameMap.push_back(it->first);
52 overlapNameMap.push_back(it->first);
53 }
54
55 m->mothurOut("Reading Blast File... "); cout.flush();
56
57 string firstName, secondName, eScore, currentRow; currentRow = "";
58 string repeatName = "";
59 float distance, thisoverlap, refScore;
60 float percentId;
61 float numBases, mismatch, gap, startQuery, endQuery, startRef, endRef, score, lengthThisSeq;
62 map<string, float> thisRowsBlastScores;
63
64 ///////////////////// Read to eliminate singletons ///////////////////////
65 ifstream fileHandle;
66 util.openInputFile(distFile, fileHandle);
67
68 map<int, int> singletonIndexSwap;
69 map<int, int> blastSingletonIndexSwap;
70 vector<bool> singleton; singleton.resize(nameAssignment.size(), true);
71 vector<bool> overlapSingleton; overlapSingleton.resize(nameAssignment.size(), true);
72 vector< map<string,float> > dists; dists.resize(nameAssignment.size());
73
74 if (!fileHandle.eof()) {
75 //read in line from file
76 fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
77 util.gobble(fileHandle);
78
79 currentRow = firstName;
80 lengthThisSeq = numBases;
81 repeatName = firstName + secondName;
82
83 if (firstName == secondName) { refScore = score; }
84 else{
85 thisRowsBlastScores[secondName] = score;
86
87 //calc overlap score
88 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
89
90 //if there is a valid overlap, add it
91 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap <= cutoff)) {
92 //convert name to number
93 map<string,long long>::iterator itA = nameAssignment.find(firstName);
94 map<string,long long>::iterator itB = nameAssignment.find(secondName);
95 if(itA == nameAssignment.end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); }
96 if(itB == nameAssignment.end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); }
97
98 int indexA = (itA->second);
99 int indexB = (itB->second);
100 overlapSingleton[indexA] = false;
101 overlapSingleton[indexB] = false;
102 blastSingletonIndexSwap[indexA] = indexA;
103 blastSingletonIndexSwap[indexB] = indexB;
104 }
105 }
106 }else { m->mothurOut("Error in your blast file, cannot read.\n"); exit(1); }
107
108
109 while(fileHandle){ //let's assume it's a triangular matrix...
110
111 if (m->getControl_pressed()) { fileHandle.close(); return 0; }
112
113 //read in line from file
114 fileHandle >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
115 util.gobble(fileHandle);
116
117 string temp = firstName + secondName; //to check if this file has repeat lines, ie. is this a blast instead of a blscreen file
118
119 //if this is a new pairing
120 if (temp != repeatName) {
121 repeatName = temp;
122
123 if (currentRow == firstName) {
124 if (firstName == secondName) { refScore = score; }
125 else{
126 //save score
127 thisRowsBlastScores[secondName] = score;
128
129 //calc overlap score
130 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
131
132 //if there is a valid overlap, add it
133 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap <= cutoff)) {
134 //convert name to number
135 map<string,long long>::iterator itA = nameAssignment.find(firstName);
136 map<string,long long>::iterator itB = nameAssignment.find(secondName);
137 if(itA == nameAssignment.end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); }
138 if(itB == nameAssignment.end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); }
139 int indexA = (itA->second);
140 int indexB = (itB->second);
141 overlapSingleton[indexA] = false;
142 overlapSingleton[indexB] = false;
143 blastSingletonIndexSwap[indexA] = indexA;
144 blastSingletonIndexSwap[indexB] = indexB;
145 }
146 } //end else
147 }else { //end row
148 //convert blast scores to distance and add cell to sparse matrix if we can
149 map<string, float>::iterator it;
150 map<string, float>::iterator itDist;
151 for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {
152 distance = 1.0 - (it->second / refScore);
153
154 //do we already have the distance calculated for b->a
155 map<string,long long>::iterator itA = nameAssignment.find(currentRow);
156 map<string,long long>::iterator itB = nameAssignment.find(it->first);
157 itDist = dists[itB->second].find(itA->first);
158
159 //if we have it then compare
160 if (itDist != dists[itB->second].end()) {
161
162 //if you want the minimum blast score ratio, then pick max distance
163 if(minWanted) { distance = max(itDist->second, distance); }
164 else{ distance = min(itDist->second, distance); }
165
166 //is this distance below cutoff
167 if (distance <= cutoff) {
168 int indexA = (itA->second);
169 int indexB = (itB->second);
170 singleton[indexA] = false;
171 singleton[indexB] = false;
172 singletonIndexSwap[indexA] = indexA;
173 singletonIndexSwap[indexB] = indexB;
174 }
175 //not going to need this again
176 dists[itB->second].erase(itDist);
177 }else { //save this value until we get the other ratio
178 dists[itA->second][it->first] = distance;
179 }
180 }
181 //clear out last rows info
182 thisRowsBlastScores.clear();
183
184 currentRow = firstName;
185 lengthThisSeq = numBases;
186
187 //add this row to thisRowsBlastScores
188 if (firstName == secondName) { refScore = score; }
189 else{ //add this row to thisRowsBlastScores
190 thisRowsBlastScores[secondName] = score;
191
192 //calc overlap score
193 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
194
195 //if there is a valid overlap, add it
196 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap <= cutoff)) {
197 //convert name to number
198 map<string,long long>::iterator itA = nameAssignment.find(firstName);
199 map<string,long long>::iterator itB = nameAssignment.find(secondName);
200 if(itA == nameAssignment.end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); }
201 if(itB == nameAssignment.end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); }
202 int indexA = (itA->second);
203 int indexB = (itB->second);
204 overlapSingleton[indexA] = false;
205 overlapSingleton[indexB] = false;
206 blastSingletonIndexSwap[indexA] = indexA;
207 blastSingletonIndexSwap[indexB] = indexB;
208 }
209 }
210 }//end if current row
211 }//end if repeat
212 }
213 fileHandle.close();
214
215 //convert blast scores to distance and add cell to sparse matrix if we can
216 map<string, float>::iterator it;
217 map<string, float>::iterator itDist;
218 for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {
219 distance = 1.0 - (it->second / refScore);
220
221 //do we already have the distance calculated for b->a
222 map<string,long long>::iterator itA = nameAssignment.find(currentRow);
223 map<string,long long>::iterator itB = nameAssignment.find(it->first);
224 itDist = dists[itB->second].find(itA->first);
225
226 //if we have it then compare
227 if (itDist != dists[itB->second].end()) {
228
229 //if you want the minimum blast score ratio, then pick max distance
230 if(minWanted) { distance = max(itDist->second, distance); }
231 else{ distance = min(itDist->second, distance); }
232
233 //is this distance below cutoff
234 if (distance <= cutoff) {
235 int indexA = (itA->second);
236 int indexB = (itB->second);
237 singleton[indexA] = false;
238 singleton[indexB] = false;
239 singletonIndexSwap[indexA] = indexA;
240 singletonIndexSwap[indexB] = indexB;
241 }
242 //not going to need this again
243 dists[itB->second].erase(itDist);
244 }else { //save this value until we get the other ratio
245 dists[itA->second][it->first] = distance;
246 }
247 }
248 //clear out info
249 thisRowsBlastScores.clear();
250 dists.clear();
251
252 //////////////////////////////////////////////////////////////////////////
253 int nonSingletonCount = 0;
254 for (int i = 0; i < singleton.size(); i++) {
255 if (!singleton[i]) { //if you are a singleton
256 singletonIndexSwap[i] = nonSingletonCount;
257 nonSingletonCount++;
258 }else { singletons.push_back(nameMap[i]); }
259 }
260 singleton.clear();
261
262 int overlapNonSingletonCount = 0;
263 for (int i = 0; i < overlapSingleton.size(); i++) {
264 if (!overlapSingleton[i]) { //if you are a singleton
265 blastSingletonIndexSwap[i] = overlapNonSingletonCount;
266 overlapNonSingletonCount++;
267 }
268 }
269 overlapSingleton.clear();
270
271 ifstream in;
272 util.openInputFile(distFile, in);
273
274 dists.resize(nameAssignment.size());
275 closeness.resize(nonSingletonCount);
276 blastOverlap.resize(overlapNonSingletonCount);
277
278 map<string, string> names;
279 if (namefile != "") {
280 util.readNames(namefile, names);
281 for (int i = 0; i < singletons.size(); i++) {
282 singletons[i] = names[singletons[i]];
283 }
284 }
285
286 m->mothurOut(" halfway ... "); cout.flush();
287
288 if (!in.eof()) {
289 //read in line from file
290 in >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
291 util.gobble(fileHandle);
292
293 currentRow = firstName;
294 lengthThisSeq = numBases;
295 repeatName = firstName + secondName;
296
297 if (firstName == secondName) { refScore = score; }
298 else{
299 //convert name to number
300 map<string,long long>::iterator itA = nameAssignment.find(firstName);
301 map<string,long long>::iterator itB = nameAssignment.find(secondName);
302 if(itA == nameAssignment.end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); }
303 if(itB == nameAssignment.end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); }
304
305 thisRowsBlastScores[secondName] = score;
306
307 if (namefile != "") {
308 firstName = names[firstName]; //redundant names
309 secondName = names[secondName]; //redundant names
310 }
311
312 nameMap[singletonIndexSwap[itA->second]] = firstName;
313 nameMap[singletonIndexSwap[itB->second]] = secondName;
314
315 //calc overlap score
316 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
317
318 //if there is a valid overlap, add it
319 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap <= cutoff)) {
320 int indexA = (itA->second);
321 int indexB = (itB->second);
322
323 int newB = blastSingletonIndexSwap[indexB];
324 int newA = blastSingletonIndexSwap[indexA];
325 blastOverlap[newA].insert(newB);
326 blastOverlap[newB].insert(newA);
327
328 overlapNameMap[newA] = firstName;
329 overlapNameMap[newB] = secondName;
330 }
331 }
332 }else { m->mothurOut("Error in your blast file, cannot read.\n"); exit(1); }
333
334
335 while(in){ //let's assume it's a triangular matrix...
336
337 if (m->getControl_pressed()) { fileHandle.close(); return 0; }
338
339 //read in line from file
340 in >> firstName >> secondName >> percentId >> numBases >> mismatch >> gap >> startQuery >> endQuery >> startRef >> endRef >> eScore >> score;
341 util.gobble(fileHandle);
342
343 string temp = firstName + secondName; //to check if this file has repeat lines, ie. is this a blast instead of a blscreen file
344
345 //if this is a new pairing
346 if (temp != repeatName) {
347 repeatName = temp;
348
349 if (currentRow == firstName) {
350 if (firstName == secondName) { refScore = score; }
351 else{
352 //convert name to number
353 map<string,long long>::iterator itA = nameAssignment.find(firstName);
354 map<string,long long>::iterator itB = nameAssignment.find(secondName);
355 if(itA == nameAssignment.end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); }
356 if(itB == nameAssignment.end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); }
357
358 //save score
359 thisRowsBlastScores[secondName] = score;
360
361 if (namefile != "") {
362 firstName = names[firstName]; //redundant names
363 secondName = names[secondName]; //redundant names
364 }
365
366 nameMap[singletonIndexSwap[itA->second]] = firstName;
367 nameMap[singletonIndexSwap[itB->second]] = secondName;
368
369 //calc overlap score
370 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
371
372 //if there is a valid overlap, add it
373 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap <= cutoff)) {
374 int indexA = (itA->second);
375 int indexB = (itB->second);
376
377 int newB = blastSingletonIndexSwap[indexB];
378 int newA = blastSingletonIndexSwap[indexA];
379 blastOverlap[newA].insert(newB);
380 blastOverlap[newB].insert(newA);
381
382 overlapNameMap[newA] = firstName;
383 overlapNameMap[newB] = secondName; }
384 } //end else
385 }else { //end row
386 //convert blast scores to distance and add cell to sparse matrix if we can
387 map<string, float>::iterator it;
388 map<string, float>::iterator itDist;
389 for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {
390 distance = 1.0 - (it->second / refScore);
391
392 //do we already have the distance calculated for b->a
393 map<string,long long>::iterator itA = nameAssignment.find(currentRow);
394 map<string,long long>::iterator itB = nameAssignment.find(it->first);
395 itDist = dists[itB->second].find(itA->first);
396
397 //if we have it then compare
398 if (itDist != dists[itB->second].end()) {
399
400 //if you want the minimum blast score ratio, then pick max distance
401 if(minWanted) { distance = max(itDist->second, distance); }
402 else{ distance = min(itDist->second, distance); }
403
404 //is this distance below cutoff
405 if (distance <= cutoff) {
406 int indexA = (itA->second);
407 int indexB = (itB->second);
408
409 int newB = singletonIndexSwap[indexB];
410 int newA = singletonIndexSwap[indexA];
411 closeness[newA].insert(newB);
412 closeness[newB].insert(newA);
413 }
414 //not going to need this again
415 dists[itB->second].erase(itDist);
416 }else { //save this value until we get the other ratio
417 dists[itA->second][it->first] = distance;
418 }
419 }
420 //clear out last rows info
421 thisRowsBlastScores.clear();
422
423 currentRow = firstName;
424 lengthThisSeq = numBases;
425
426 //add this row to thisRowsBlastScores
427 if (firstName == secondName) { refScore = score; }
428 else{ //add this row to thisRowsBlastScores
429
430 //convert name to number
431 map<string,long long>::iterator itA = nameAssignment.find(firstName);
432 map<string,long long>::iterator itB = nameAssignment.find(secondName);
433 if(itA == nameAssignment.end()){ m->mothurOut("AError: Sequence '" + firstName + "' was not found in the names file, please correct\n"); exit(1); }
434 if(itB == nameAssignment.end()){ m->mothurOut("BError: Sequence '" + secondName + "' was not found in the names file, please correct\n"); exit(1); }
435
436 thisRowsBlastScores[secondName] = score;
437
438 //calc overlap score
439 thisoverlap = 1.0 - (percentId * (lengthThisSeq - startQuery) / endRef / 100.0 - penalty);
440
441 //if there is a valid overlap, add it
442 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap <= cutoff)) {
443 int indexA = (itA->second);
444 int indexB = (itB->second);
445
446 int newB = blastSingletonIndexSwap[indexB];
447 int newA = blastSingletonIndexSwap[indexA];
448 blastOverlap[newA].insert(newB);
449 blastOverlap[newB].insert(newA);
450
451 overlapNameMap[newA] = firstName;
452 overlapNameMap[newB] = secondName;
453 }
454 }
455 }//end if current row
456 }//end if repeat
457 }
458 in.close();
459
460 //convert blast scores to distance and add cell to sparse matrix if we can
461 for(it=thisRowsBlastScores.begin(); it!=thisRowsBlastScores.end(); it++) {
462 distance = 1.0 - (it->second / refScore);
463
464 //do we already have the distance calculated for b->a
465 map<string,long long>::iterator itA = nameAssignment.find(currentRow);
466 map<string,long long>::iterator itB = nameAssignment.find(it->first);
467 itDist = dists[itB->second].find(itA->first);
468
469 //if we have it then compare
470 if (itDist != dists[itB->second].end()) {
471
472 //if you want the minimum blast score ratio, then pick max distance
473 if(minWanted) { distance = max(itDist->second, distance); }
474 else{ distance = min(itDist->second, distance); }
475
476 //is this distance below cutoff
477 if (distance <= cutoff) {
478 int indexA = (itA->second);
479 int indexB = (itB->second);
480
481 int newB = singletonIndexSwap[indexB];
482 int newA = singletonIndexSwap[indexA];
483 closeness[newA].insert(newB);
484 closeness[newB].insert(newA);
485 }
486 //not going to need this again
487 dists[itB->second].erase(itDist);
488 }else { //save this value until we get the other ratio
489 dists[itA->second][it->first] = distance;
490 }
491 }
492 //clear out info
493 thisRowsBlastScores.clear();
494 dists.clear();
495 nameAssignment.clear();
496
497 m->mothurOut(" done.\n");
498
499 return 1;
500 }
501 catch(exception& e) {
502 m->errorOut(e, "OptiBlastMatrix", "readBlast");
503 exit(1);
504 }
505 }
506 /*********************************************************************************************/
readBlastNames(map<string,long long> & nameAssignment)507 int OptiBlastMatrix::readBlastNames(map<string, long long>& nameAssignment) {
508 try {
509 m->mothurOut("Reading names... "); cout.flush();
510
511 string name, hold, prevName;
512 int num = 0;
513
514 ifstream in;
515 Utils util; util.openInputFile(distFile, in);
516
517 //read first line
518 in >> prevName;
519
520 for (int i = 0; i < 11; i++) { in >> hold; }
521 util.gobble(in);
522
523 //save name in nameMap
524 nameAssignment[prevName] = num; num++;
525
526 map<string, long long>::iterator it;
527 while (!in.eof()) {
528 if (m->getControl_pressed()) { in.close(); return 0; }
529
530 //read line
531 in >> name;
532
533 for (int i = 0; i < 11; i++) { in >> hold; }
534 util.gobble(in);
535
536 //is this a new name?
537 if (name != prevName) {
538 prevName = name;
539
540 it = nameAssignment.find(name);
541 if (it != nameAssignment.end()) { m->mothurOut("[ERROR]: trying to exact names from blast file, and I found dups. Are you sequence names unique? quitting.\n"); m->setControl_pressed(true); }
542
543 else { nameAssignment[name] = num; num++; }
544 }
545 }
546
547 in.close();
548
549 if (m->getControl_pressed()) { return 0; }
550
551 m->mothurOut(toString(num) + " names read.\n");
552
553 return 0;
554
555 }
556 catch(exception& e) {
557 m->errorOut(e, "OptiBlastMatrix", "readBlastNames");
558 exit(1);
559 }
560 }
561 /***********************************************************************/
562
563
564
565