1 //
2 // optirefmatrix.cpp
3 // Mothur
4 //
5 // Created by Sarah Westcott on 5/3/18.
6 // Copyright © 2018 Schloss Lab. All rights reserved.
7 //
8
9 #include "optirefmatrix.hpp"
10 #include "counttable.h"
11
12 /***********************************************************************/
OptiRefMatrix(string distFile,string distFormat,string dupsFile,string dupsFormat,double c,float fP,string refWeight)13 OptiRefMatrix::OptiRefMatrix(string distFile, string distFormat, string dupsFile, string dupsFormat, double c, float fP, string refWeight) : OptiData(c) {
14
15 numFitSingletons = 0;
16 numRefSingletons = 0;
17 numSingletons = 0;
18 numBetweenDists = 0;
19 numFitDists = 0;
20 numRefDists = 0;
21 numFitSeqs = 0;
22 refWeightMethod = refWeight;
23
24 fitPercent = fP / 100.0;
25 if (fitPercent < 0.001) { fitPercent = 0.10; m->mothurOut("[WARNING]: fit percentage must be between 0.001 (0.1%) and 1.0 (100%). Setting to 0.10 or 10%. \n"); } //minumum of 0.1%
26 else if (fitPercent > 100.0) { m->mothurOut("[ERROR]: fit percentage must be between 0.0001 and 100.0\n"); m->setControl_pressed(true); }
27
28 square = false;
29
30 set<string> noRefNamesSet;
31 readFiles(distFile, distFormat, dupsFile, dupsFormat, noRefNamesSet);
32 }
33 /***********************************************************************/
OptiRefMatrix(string distFile,string distFormat,string dupsFile,string dupsFormat,double c,set<string> accnosRefFileNames)34 OptiRefMatrix::OptiRefMatrix(string distFile, string distFormat, string dupsFile, string dupsFormat, double c, set<string> accnosRefFileNames) : OptiData(c) {
35
36 numFitSingletons = 0;
37 numRefSingletons = 0;
38 numSingletons = 0;
39 numBetweenDists = 0;
40 numFitDists = 0;
41 numRefDists = 0;
42 numFitSeqs = 0;
43 refWeightMethod = "accnos";
44
45 square = false;
46
47 readFiles(distFile, distFormat, dupsFile, dupsFormat, accnosRefFileNames);
48 }
49
50 /***********************************************************************/
OptiRefMatrix(string d,string nc,string f,string df,double c,string fit,string fitnc,string fitf,string fitdf,string betweend,string betweendf)51 OptiRefMatrix::OptiRefMatrix(string d, string nc, string f, string df, double c, string fit, string fitnc, string fitf, string fitdf, string betweend, string betweendf) : OptiData(c) {
52
53 string refdistfile, refnamefile, refcountfile, refformat, refdistformat, fitdistfile, fitnamefile, fitcountfile, fitformat, fitdistformat, betweendistfile, betweendistformat;
54
55 refdistfile = d; refdistformat = df; refformat = f; fitdistfile = fit; fitdistformat = fitdf; fitformat = fitf; betweendistfile = betweend; betweendistformat = betweendf;
56
57 numFitSingletons = 0;
58 numRefSingletons = 0;
59 numSingletons = 0;
60 numBetweenDists = 0;
61 numFitDists = 0;
62 numRefDists = 0;
63 numFitSeqs = 0;
64
65 fitPercent = 0;
66 refWeightMethod = "none";
67
68 square = false;
69
70 if (refformat == "name") { refnamefile = nc; refcountfile = ""; }
71 else if (refformat == "count") { refcountfile = nc; refnamefile = ""; }
72 else { refcountfile = ""; refnamefile = ""; }
73
74 if (fitformat == "name") { fitnamefile = fitnc; fitcountfile = ""; }
75 else if (fitformat == "count") { fitcountfile = fitnc; fitnamefile = ""; }
76 else { fitcountfile = ""; fitnamefile = ""; }
77
78 readFiles(refdistfile, refnamefile, refcountfile, refformat, refdistformat, fitdistfile, fitnamefile, fitcountfile, fitformat, fitdistformat, betweendistfile, betweendistformat);
79 }
80 /***********************************************************************/
81 //Since we are extracting a subset of the seqs some reads that may not have been singletons
extractRefMatrix()82 OptiData* OptiRefMatrix::extractRefMatrix() {
83 try {
84 set<long long> seqs; for (long long i = 0; i < isRef.size(); i++) { if (isRef[i]) { seqs.insert(i); } }
85
86 vector<string> subsetNameMap;
87 vector<string> subsetSingletons;
88 vector< set<long long> > subsetCloseness;
89 map<long long, long long> thisNameMap;
90 map<long long, long long> nonSingletonNameMap;
91 vector<bool> singleton; singleton.resize(seqs.size(), true);
92 int count = 0;
93
94 for (set<long long>::iterator it = seqs.begin(); it != seqs.end(); it++) {
95 long long seqNum = *it;
96 thisNameMap[seqNum] = count;
97 nonSingletonNameMap[count] = seqNum;
98
99 set<long long> thisSeqsCloseSeqs = getCloseSeqs(seqNum);
100 for (set<long long>::iterator itClose = thisSeqsCloseSeqs.begin(); itClose != thisSeqsCloseSeqs.end(); itClose++) {
101
102 if (m->getControl_pressed()) { break; }
103
104 long long thisSeq = *itClose;
105
106 //is this seq in the set of unfitted?
107 if (seqs.count(thisSeq) != 0) { singleton[thisNameMap[seqNum]] = false; }
108 }
109 count++;
110 }
111
112 int nonSingletonCount = 0;
113 for (long long i = 0; i < singleton.size(); i++) {
114 if (!singleton[i]) { //if you are not a singleton
115 nonSingletonNameMap[i] = nonSingletonCount;
116 nonSingletonCount++;
117 }else { seqs.erase(nonSingletonNameMap[i]); subsetSingletons.push_back(getName(nonSingletonNameMap[i])); } //remove from unfitted
118 }
119 singleton.clear();
120
121 subsetCloseness.resize(nonSingletonCount);
122 for (set<long long>::iterator it = seqs.begin(); it != seqs.end(); it++) {
123
124 if (m->getControl_pressed()) { break; }
125
126 long long seqNum = *it;
127
128 set<long long> thisSeqsCloseSeqs = getCloseSeqs(seqNum);
129 set<long long> thisSeqsCloseUnFittedSeqs;
130 for (set<long long>::iterator itClose = thisSeqsCloseSeqs.begin(); itClose != thisSeqsCloseSeqs.end(); itClose++) {
131
132 if (m->getControl_pressed()) { break; }
133
134 long long thisSeq = *itClose;
135
136 //is this seq in the set of unfitted?
137 if (seqs.count(thisSeq) != 0) { thisSeqsCloseUnFittedSeqs.insert(nonSingletonNameMap[thisNameMap[thisSeq]]); }
138 }
139
140 if (!thisSeqsCloseUnFittedSeqs.empty()) {
141 subsetCloseness[nonSingletonNameMap[thisNameMap[seqNum]]] = thisSeqsCloseUnFittedSeqs;
142 subsetNameMap.push_back(getName(seqNum));
143 }
144
145 }
146
147 for (int i = 0; i < isSingleRef.size(); i++) { if (isSingleRef[i]) { subsetSingletons.push_back(singletons[i]); } }
148
149 OptiData* unfittedMatrix = new OptiMatrix(subsetCloseness, subsetNameMap, subsetSingletons, cutoff);
150
151 return unfittedMatrix;
152 }
153 catch(exception& e) {
154 m->errorOut(e, "OptiRefMatrix", "extractRefMatrix");
155 exit(1);
156 }
157 }
158 /***********************************************************************/
159 //given set of names, pull out their dists and create optimatrix
extractMatrixSubset(set<string> & seqs)160 OptiData* OptiRefMatrix::extractMatrixSubset(set<string>& seqs) {
161 try {
162
163 set<long long> seqIndexes = getIndexes(seqs);
164 return extractMatrixSubset(seqIndexes);
165 }
166 catch(exception& e) {
167 m->errorOut(e, "OptiRefMatrix", "extractMatrixSubset");
168 exit(1);
169 }
170 }
171 /***********************************************************************/
172 //given matrix indexes of seqs, pull out their dists and create optimatrix
extractMatrixSubset(set<long long> & seqs)173 OptiData* OptiRefMatrix::extractMatrixSubset(set<long long> & seqs) {
174 try {
175 vector<string> subsetNameMap;
176 vector<string> subsetSingletons;
177 vector< set<long long> > subsetCloseness;
178 map<long long, long long> thisNameMap;
179 map<long long, long long> nonSingletonNameMap;
180 vector<bool> singleton; singleton.resize(seqs.size(), true);
181 int count = 0;
182
183 for (set<long long>::iterator it = seqs.begin(); it != seqs.end(); it++) {
184
185 long long seqNum = *it;
186 thisNameMap[seqNum] = count;
187 nonSingletonNameMap[count] = seqNum;
188
189 set<long long> thisSeqsCloseSeqs = getCloseSeqs(seqNum);
190 for (set<long long>::iterator itClose = thisSeqsCloseSeqs.begin(); itClose != thisSeqsCloseSeqs.end(); itClose++) {
191
192 if (m->getControl_pressed()) { break; }
193
194 long long thisSeq = *itClose;
195
196 //is this seq in the set of unfitted?
197 if (seqs.count(thisSeq) != 0) { singleton[thisNameMap[seqNum]] = false; }
198 }
199 count++;
200 }
201
202 int nonSingletonCount = 0;
203 for (long long i = 0; i < singleton.size(); i++) {
204 if (!singleton[i]) { //if you are a singleton
205 nonSingletonNameMap[i] = nonSingletonCount;
206 nonSingletonCount++;
207 }else { seqs.erase(nonSingletonNameMap[i]); subsetSingletons.push_back(getName(nonSingletonNameMap[i])); } //remove from unfitted
208 }
209 singleton.clear();
210
211 subsetCloseness.resize(nonSingletonCount);
212 for (set<long long>::iterator it = seqs.begin(); it != seqs.end(); it++) {
213
214 if (m->getControl_pressed()) { break; }
215
216 long long seqNum = *it;
217
218 set<long long> thisSeqsCloseSeqs = getCloseSeqs(seqNum);
219 set<long long> thisSeqsCloseUnFittedSeqs;
220 for (set<long long>::iterator itClose = thisSeqsCloseSeqs.begin(); itClose != thisSeqsCloseSeqs.end(); itClose++) {
221
222 if (m->getControl_pressed()) { break; }
223
224 long long thisSeq = *itClose;
225
226 //is this seq in the set of unfitted?
227 if (seqs.count(thisSeq) != 0) { thisSeqsCloseUnFittedSeqs.insert(nonSingletonNameMap[thisNameMap[thisSeq]]); }
228 }
229
230 if (!thisSeqsCloseUnFittedSeqs.empty()) {
231 subsetCloseness[nonSingletonNameMap[thisNameMap[seqNum]]] = thisSeqsCloseUnFittedSeqs;
232 subsetNameMap.push_back(getName(seqNum));
233 }
234
235 }
236
237 OptiData* unfittedMatrix = new OptiMatrix(subsetCloseness, subsetNameMap, subsetSingletons, cutoff);
238
239 return unfittedMatrix;
240 }
241 catch(exception& e) {
242 m->errorOut(e, "OptiRefMatrix", "extractMatrixSubset");
243 exit(1);
244 }
245 }
246 /***********************************************************************/
getTranslatedBins(vector<vector<string>> & binNames,vector<vector<long long>> & fixedBins)247 vector<long long> OptiRefMatrix::getTranslatedBins(vector<vector<string> > & binNames, vector<vector<long long> > & fixedBins) {
248 try {
249 fixedBins.clear();
250
251 map<string, long long> nameIndexes;
252 set<string> unique;
253 for (long long i = 0; i < nameMap.size(); i++) { //vector of string representing the sequences in the matrix from the name file.
254 vector<string> thisSeqsReps; util.splitAtComma(nameMap[i], thisSeqsReps); //split redundant names
255 if (i < closeness.size()) { nameIndexes[thisSeqsReps[0]] = i; } //this is a sequence with distances in the matrix
256 if (thisSeqsReps.size() == 1) { //you are unique
257 unique.insert(thisSeqsReps[0]);
258 }
259 }
260
261 for (long long i = 0; i < singletons.size(); i++) {
262 if (isSingleRef[i]) {
263 vector<string> thisSeqsReps; util.splitAtComma(singletons[i], thisSeqsReps); //split redundant names
264 nameIndexes[thisSeqsReps[0]] = -1;
265 if (thisSeqsReps.size() == 1) { unique.insert(thisSeqsReps[0]); }
266 }
267 }
268
269 for (long long i = 0; i < binNames.size(); i++) { //for each OTU
270 vector<long long> thisBinsSeqs;
271 for (long long j = 0; j < binNames[i].size(); j++) { //for each sequence
272 map<string, long long>::iterator it = nameIndexes.find(binNames[i][j]);
273
274 if (it == nameIndexes.end()) { }//not in distance matrix, but needs a value in fixedBins. 2 reasons for making it here: you are a redundant name in the listfile, you do not have any distances
275 else { thisBinsSeqs.push_back(it->second); } //"name" of sequence in matrix
276 }
277 fixedBins.push_back(thisBinsSeqs);
278 }
279
280 return (getFitSeqs());
281 }
282 catch(exception& e) {
283 m->errorOut(e, "OptiRefMatrix", "getTranslatedBins");
284 exit(1);
285 }
286 }
287 /***********************************************************************/
288 //assumes that i is a fitSeq
isCloseFit(long long i,long long toFind,bool & isFit)289 bool OptiRefMatrix::isCloseFit(long long i, long long toFind, bool& isFit){
290 try {
291 if (i < 0) { return false; }
292 else if (i > closeness.size()) { m->mothurOut("[ERROR]: index is not valid.\n"); m->setControl_pressed(true); return false; }
293
294 bool found = false;
295 if (!isRef[toFind]) { //are you a fit seq
296 if (closeness[i].count(toFind) != 0) { //are you close
297 found = true;
298 }
299 isFit = true;
300 }else { isFit = false; }
301 return found;
302 }
303 catch(exception& e) {
304 m->errorOut(e, "OptiRefMatrix", "isCloseFit");
305 exit(1);
306 }
307 }
308 /***********************************************************************/
309 //does not include singletons, only reads in closeness
getRefSeqs()310 vector<long long> OptiRefMatrix::getRefSeqs() {
311 try {
312 vector<long long> refSeqsIndexes;
313 for (long long i = 0; i < isRef.size(); i++) {
314 if (isRef[i]) { refSeqsIndexes.push_back(i); }
315 }
316 return refSeqsIndexes;
317 }
318 catch(exception& e) {
319 m->errorOut(e, "OptiRefMatrix", "getRefSeqs");
320 exit(1);
321 }
322 }
323 /***********************************************************************/
getRefSingletonNames()324 vector<string> OptiRefMatrix::getRefSingletonNames() {
325 try {
326 vector<string> refSeqsNames;
327
328 for (long long i = 0; i < isSingleRef.size(); i++) {
329 if (isSingleRef[i]) { refSeqsNames.push_back(singletons[i]); }
330 }
331
332 return refSeqsNames;
333 }
334 catch(exception& e) {
335 m->errorOut(e, "OptiRefMatrix", "getRefSingletonNames");
336 exit(1);
337 }
338 }
339 /***********************************************************************/
getFitSeqs()340 vector<long long> OptiRefMatrix::getFitSeqs() {
341 try {
342 vector<long long> fitSeqsIndexes;
343 for (long long i = 0; i < isRef.size(); i++) {
344 if (!isRef[i]) { fitSeqsIndexes.push_back(i); }
345 }
346 return fitSeqsIndexes;
347 }
348 catch(exception& e) {
349 m->errorOut(e, "OptiRefMatrix", "getFitSeqs");
350 exit(1);
351 }
352 }
353 /***********************************************************************/
getNumFitTrueSingletons()354 long long OptiRefMatrix::getNumFitTrueSingletons() {
355 try {
356 long long numFitTrueSingletons = 0;
357
358 for (long long i = 0; i < isSingleRef.size(); i++) {
359 if (!isSingleRef[i]) { numFitTrueSingletons++; }
360 }
361
362 return numFitTrueSingletons;
363 }
364 catch(exception& e) {
365 m->errorOut(e, "OptiData", "getNumFitTrueSingletons");
366 exit(1);
367 }
368 }
369 /***********************************************************************/
getNumFitClose(long long index)370 long long OptiRefMatrix::getNumFitClose(long long index) {
371 try {
372 long long numClose = 0;
373
374 if (index < 0) { }
375 else if (index > closeness.size()) { m->mothurOut("[ERROR]: index is not valid.\n"); m->setControl_pressed(true); }
376 else {
377 //reference seqs all have indexes less than refEnd
378 for (set<long long>::iterator it = closeness[index].begin(); it != closeness[index].end(); it++) {
379 if (!isRef[*it]) { numClose++; } //you are a fit seq
380 }
381 }
382
383 return numClose;
384 }
385 catch(exception& e) {
386 m->errorOut(e, "OptiData", "getNumClose");
387 exit(1);
388 }
389 }
390 /***********************************************************************/
getNumRefClose(long long index)391 long long OptiRefMatrix::getNumRefClose(long long index) {
392 try {
393 long long numClose = 0;
394
395 if (index < 0) { }
396 else if (index > closeness.size()) { m->mothurOut("[ERROR]: index is not valid.\n"); m->setControl_pressed(true); }
397 else {
398 //reference seqs all have indexes less than refEnd
399 for (set<long long>::iterator it = closeness[index].begin(); it != closeness[index].end(); it++) {
400 if (isRef[*it]) { numClose++; } //you are a ref seq
401 }
402 }
403
404 return numClose;
405 }
406 catch(exception& e) {
407 m->errorOut(e, "OptiData", "getNumClose");
408 exit(1);
409 }
410 }
411 /***********************************************************************/
getCloseFitSeqs(long long index)412 set<long long> OptiRefMatrix::getCloseFitSeqs(long long index){
413 try {
414 set<long long> closeSeqs;
415
416 if (index < 0) { }
417 else if (index > closeness.size()) { m->mothurOut("[ERROR]: index is not valid.\n"); m->setControl_pressed(true); } //
418 else {
419 //reference seqs all have indexes less than refEnd
420 for (set<long long>::iterator it = closeness[index].begin(); it != closeness[index].end(); it++) {
421 if (!isRef[*it]) { closeSeqs.insert(*it); } //you are a fit seq
422 }
423 }
424
425 return closeSeqs;
426 }
427 catch(exception& e) {
428 m->errorOut(e, "OptiRefMatrix", "getCloseFitSeqs");
429 exit(1);
430 }
431 }
432 /***********************************************************************/
getCloseRefSeqs(long long index)433 set<long long> OptiRefMatrix::getCloseRefSeqs(long long index){
434 try {
435 set<long long> closeSeqs;
436
437 if (index < 0) { }
438 else if (index > closeness.size()) { m->mothurOut("[ERROR]: index is not valid.\n"); m->setControl_pressed(true); }
439 else {
440 //reference seqs all have indexes less than refEnd
441 for (set<long long>::iterator it = closeness[index].begin(); it != closeness[index].end(); it++) {
442 if (isRef[*it]) { closeSeqs.insert(*it); } //you are a ref seq
443 }
444 }
445
446 return closeSeqs;
447 }
448 catch(exception& e) {
449 m->errorOut(e, "OptiRefMatrix", "getCloseFitSeqs");
450 exit(1);
451 }
452 }
453 /***********************************************************************/
454 //only used in open reference clustering
getFitListSingle()455 ListVector* OptiRefMatrix::getFitListSingle() {
456 try {
457 ListVector* singlelist = NULL;
458
459 if (singletons.size() == 0) { }
460 else {
461 singlelist = new ListVector();
462
463 for (int i = 0; i < isSingleRef.size(); i++) {
464 if (!isSingleRef[i]) {
465 singlelist->push_back(singletons[i]); }
466 }
467 }
468
469 return singlelist;
470 }
471 catch(exception& e) {
472 m->errorOut(e, "OptiRefMatrix", "getFitListSingle");
473 exit(1);
474 }
475 }
476 /***********************************************************************/
randomizeRefs()477 void OptiRefMatrix::randomizeRefs() {
478 try {
479 long long totalSeqs = (isRef.size()+isSingleRef.size());
480 long long numToSelect = totalSeqs * fitPercent;
481 long long refSingletonCutoff = isRef.size();
482 long long singleSize = isSingleRef.size();
483
484 //select sequences to be reference
485 set<long long> fitSeqsIndexes;
486 if (weights.size() != 0) { fitSeqsIndexes = subsample.getWeightedSample(weights, numToSelect); } //you have weighted selection
487 else {
488 long long numSelected = 0;
489 while (numSelected < numToSelect) {
490 if (m->getControl_pressed()) { break; }
491 fitSeqsIndexes.insert(util.getRandomIndex(totalSeqs-1)); //no repeats
492 numSelected = fitSeqsIndexes.size();
493 }
494 }
495
496 //initilize isRef to true
497 isRef.clear(); isRef.resize(refSingletonCutoff, true);
498 isSingleRef.clear(); isSingleRef.resize(singleSize, true);
499
500 //set isRef values
501 for (set<long long>::iterator it = fitSeqsIndexes.begin(); it != fitSeqsIndexes.end(); it++) {
502 if (m->getControl_pressed()) { break; }
503
504 long long thisSeq = *it;
505 if (thisSeq < refSingletonCutoff) { //you are a non singleton seq in the closeness
506 isRef[thisSeq] = false;
507 }else { //thisSeq is a singleton
508 isSingleRef[thisSeq-refSingletonCutoff] = false;
509 }
510 }
511
512 //find number of fitDists, refDists and between dists
513 calcCounts();
514 }
515 catch(exception& e) {
516 m->errorOut(e, "OptiRefMatrix", "randomizeRefs");
517 exit(1);
518 }
519
520 }
521 /***********************************************************************/
522 //for denovo method
readFiles(string distFile,string distFormat,string dupsFile,string dupsFormat,set<string> & optionalRefNames)523 int OptiRefMatrix::readFiles(string distFile, string distFormat, string dupsFile, string dupsFormat, set<string>& optionalRefNames) {
524 try {
525 string namefile, countfile;
526 if (dupsFormat == "name") { namefile = dupsFile; countfile = ""; }
527 else if (dupsFormat == "count") { countfile = dupsFile; namefile = ""; }
528 else { countfile = ""; namefile = ""; }
529
530 map<string, long long> nameAssignment;
531 if (namefile != "") { util.readNames(namefile, nameAssignment); }
532 else {
533 CountTable ct; ct.readTable(countfile, false, true);
534 map<string, int> temp = ct.getNameMap();
535 for (map<string, int>::iterator it = temp.begin(); it!= temp.end(); it++) { nameAssignment[it->first] = it->second; }
536 }
537
538 //select sequences to be reference
539 set<long long> fitSeqsIndexes;
540 long long count = 0;
541 for (map<string, long long>::iterator it = nameAssignment.begin(); it!= nameAssignment.end(); it++) {
542 if (refWeightMethod == "abundance") { weights[count] = it->second; }
543 else if (refWeightMethod == "connectivity") { weights[count] = 1; } //initialize
544 else if (refWeightMethod == "accnos") { //fill fit indexes
545 if (optionalRefNames.count(it->first) == 0) { //you are not a reference sequence
546 fitSeqsIndexes.insert(count); //add as fit seq
547 }
548 }
549 it->second = count; count++;
550 nameMap.push_back(it->first);
551 nameAssignment[it->first] = it->second;
552 }
553
554 //read file to find singletons
555 vector<bool> singleton; singleton.resize(count, true);
556 map<long long, long long> singletonIndexSwap;
557
558 if (distFormat == "column") { singletonIndexSwap = readColumnSingletons(singleton, distFile, nameAssignment); }
559 else if (distFormat == "phylip") { singletonIndexSwap = readPhylipSingletons(singleton, distFile, count, nameAssignment); }
560
561 int nonSingletonCount = 0;
562 for (int i = 0; i < singleton.size(); i++) {
563 if (!singleton[i]) { //if you are not a singleton
564 singletonIndexSwap[i] = nonSingletonCount;
565 nonSingletonCount++;
566 }else { singletons.push_back(nameMap[i]); }
567 }
568 numSingletons = singletons.size();
569 closeness.resize(nonSingletonCount);
570
571 map<string, string> names;
572 if (namefile != "") {
573 //update names for reference
574 util.readNames(namefile, names);
575 for (int i = 0; i < numSingletons; i++) {
576 singletons[i] = names[singletons[i]];
577 }
578 }
579
580 //read reference file distances
581 bool hasName = false;
582 if (namefile != "") { hasName = true; }
583 if (distFormat == "column") { readColumn(distFile, hasName, names, nameAssignment, singletonIndexSwap); }
584 else if (distFormat == "phylip") { readPhylip(distFile, hasName, names, nameAssignment, singletonIndexSwap); }
585
586
587 //randomly select the "fit" seqs
588 long long numToSelect = nameAssignment.size() * fitPercent;
589 if (weights.size() != 0) { fitSeqsIndexes = subsample.getWeightedSample(weights, numToSelect); } //you have weighted selection
590 else {
591 if (refWeightMethod == "accnos") { } //fitIndexes are filled above
592 else { //randomly select references
593 long long numSelected = 0;
594 long long totalSeqs = nameAssignment.size();
595 while (numSelected < numToSelect) {
596 if (m->getControl_pressed()) { break; }
597 fitSeqsIndexes.insert(util.getRandomIndex(totalSeqs-1)); //no repeats
598 numSelected = fitSeqsIndexes.size();
599 }
600 }
601 }
602
603 //flag reference seqs singleton or not
604 for (int i = 0; i < singleton.size(); i++) {
605 if (!singleton[i]) { //if you are not a singleton
606
607 if (fitSeqsIndexes.count(i) != 0) { //you are a fit seq
608 isRef.push_back(false);
609 }else { isRef.push_back(true); } //its a reference
610 }else {
611 if (fitSeqsIndexes.count(i) != 0) { //you are a fit seq singleton
612 isSingleRef.push_back(false);
613 }else { isSingleRef.push_back(true); } //its a singleton reference
614 }
615 }
616 singleton.clear();
617
618 //find number of fitDists, refDists and between dists
619 calcCounts();
620
621 return 0;
622 }
623 catch(exception& e) {
624 m->errorOut(e, "OptiRefMatrix", "readFiles");
625 exit(1);
626 }
627 }
628 /***********************************************************************/
629 //for reading reference and fit files separately, reference method
readFiles(string refdistfile,string refnamefile,string refcountfile,string refformat,string refdistformat,string fitdistfile,string fitnamefile,string fitcountfile,string fitformat,string fitdistformat,string betweendistfile,string betweendistformat)630 int OptiRefMatrix::readFiles(string refdistfile, string refnamefile, string refcountfile, string refformat, string refdistformat, string fitdistfile, string fitnamefile, string fitcountfile, string fitformat, string fitdistformat, string betweendistfile, string betweendistformat){
631 try {
632 map<string, long long> nameAssignment;
633 if (refnamefile != "") { util.readNames(refnamefile, nameAssignment); }
634 else {
635 CountTable ct; ct.readTable(refcountfile, false, true);
636 map<string, int> temp = ct.getNameMap();
637 for (map<string, int>::iterator it = temp.begin(); it!= temp.end(); it++) { nameAssignment[it->first] = it->second; }
638 }
639
640 long long count = 0;
641 for (map<string, long long>::iterator it = nameAssignment.begin(); it!= nameAssignment.end(); it++) {
642 it->second = count; count++;
643 nameMap.push_back(it->first);
644 nameAssignment[it->first] = it->second;
645 }
646
647 long long refCount = count;
648 vector<bool> singleton; singleton.resize(count, true); //resize will only set new elements to true
649 map<long long, long long> refSingletonIndexSwap; //index into
650 if (refdistformat == "column") { refSingletonIndexSwap = readColumnSingletons(singleton, refdistfile, nameAssignment); }
651 else if (refdistformat == "phylip") { refSingletonIndexSwap = readPhylipSingletons(singleton, refdistfile, count, nameAssignment); }
652
653 //read fit file to find singletons
654 map<long long, long long> fitSingletonIndexSwap;
655 map<string, long long> fitnameAssignment;
656 if (fitnamefile != "") { util.readNames(fitnamefile, fitnameAssignment); }
657 else {
658 CountTable ct; ct.readTable(fitcountfile, false, true);
659 map<string, int> temp = ct.getNameMap();
660 for (map<string, int>::iterator it = temp.begin(); it!= temp.end(); it++) { fitnameAssignment[it->first] = it->second; }
661 }
662
663 for (map<string, long long>::iterator it = fitnameAssignment.begin(); it!= fitnameAssignment.end(); it++) {
664 it->second = count; count++;
665 nameMap.push_back(it->first);
666 nameAssignment[it->first] = it->second;
667 }
668
669 singleton.resize(count, true);
670 if (fitdistformat == "column") { fitSingletonIndexSwap = readColumnSingletons(singleton, fitdistfile, nameAssignment); }
671 else if (fitdistformat == "phylip") { fitSingletonIndexSwap = readPhylipSingletons(singleton, fitdistfile, count, nameAssignment); }
672
673 fitPercent = ((count-refCount) / (float) count);
674
675 //read bewtween file to update singletons
676 readColumnSingletons(singleton, betweendistfile, nameAssignment);
677
678 long long nonSingletonCount = 0;
679 map<long long, long long> singletonIndexSwap;
680 for (long long i = 0; i < refCount; i++) {
681 if (!singleton[i]) { //if you are not a singleton
682 singletonIndexSwap[i] = nonSingletonCount;
683 isRef.push_back(true);
684 nonSingletonCount++;
685 }else {
686 singletons.push_back(nameMap[i]);
687 isSingleRef.push_back(true);
688 }
689 }
690 refSingletonIndexSwap.clear();
691
692 for (long long i = refCount; i < singleton.size(); i++) {
693 if (!singleton[i]) { //if you are not a singleton
694 singletonIndexSwap[i] = nonSingletonCount;
695 isRef.push_back(false);
696 nonSingletonCount++;
697 }else {
698 singletons.push_back(nameMap[i]);
699 isSingleRef.push_back(false);
700 }
701 }
702 singleton.clear();
703 fitSingletonIndexSwap.clear();
704
705 numSingletons = singletons.size();
706 closeness.resize(nonSingletonCount);
707
708 map<string, string> names;
709 if (refnamefile != "") { util.readNames(refnamefile, names); }
710
711 if (fitnamefile != "") {
712 map<string, string> fitnames;
713 util.readNames(fitnamefile, fitnames);
714
715 names.insert(fitnames.begin(), fitnames.end()); //copy fit names into names
716 }
717
718 if ((fitnamefile != "") || (refnamefile != "")) {
719 for (int i = 0; i < singletons.size(); i++) {
720 map<string, string>::iterator it = names.find(singletons[i]);
721 if (it != names.end()) { //update singletons
722 singletons[i] = it->second;
723 }
724 }
725 }
726
727 //read reference file distances
728 bool refHasName = false;
729 if (refnamefile != "") { refHasName = true; }
730 if (refdistformat == "column") { readColumn(refdistfile, refHasName, names, nameAssignment, singletonIndexSwap); }
731 else if (refdistformat == "phylip") { readPhylip(refdistfile, refHasName, names, nameAssignment, singletonIndexSwap); }
732
733
734 //read fit distances
735 bool fitHasName = false;
736 if (fitnamefile != "") { fitHasName = true; }
737 if (fitdistformat == "column") { readColumn(fitdistfile, fitHasName, names, nameAssignment, singletonIndexSwap); }
738 else if (fitdistformat == "phylip") { readPhylip(fitdistfile, fitHasName, names, nameAssignment, singletonIndexSwap); }
739
740
741 //read in between distances
742 bool hasName = fitHasName;
743 if (!hasName && refHasName) { hasName = true; } //if either the ref or fit has a name file then set hasName
744 if (betweendistformat == "column") { readColumn(betweendistfile, hasName, names, nameAssignment, singletonIndexSwap); }
745 else if (betweendistformat == "phylip") { readPhylip(betweendistfile, hasName, names, nameAssignment, singletonIndexSwap); }
746
747 //find number of fitDists, refDists and between dists
748 calcCounts();
749
750 return 0;
751 }
752 catch(exception& e) {
753 m->errorOut(e, "OptiRefMatrix", "readFiles");
754 exit(1);
755 }
756 }
757 /***********************************************************************/
readColumnSingletons(vector<bool> & singleton,string distFile,map<string,long long> & nameAssignment)758 map<long long, long long> OptiRefMatrix::readColumnSingletons(vector<bool>& singleton, string distFile, map<string, long long>& nameAssignment){
759 try {
760
761 ifstream fileHandle; util.openInputFile(distFile, fileHandle);
762
763 string firstName, secondName;
764 double distance;
765 map<long long, long long> singletonIndexSwap;
766
767 while(fileHandle){ //let's assume it's a triangular matrix...
768
769 fileHandle >> firstName; util.gobble(fileHandle);
770 fileHandle >> secondName; util.gobble(fileHandle);
771 fileHandle >> distance; util.gobble(fileHandle); // get the row and column names and distance
772
773 if (m->getDebug()) { cout << firstName << '\t' << secondName << '\t' << distance << endl; }
774
775 if (m->getControl_pressed()) { break; }
776
777 if (util.isEqual(distance,-1)) { distance = 1000000; }
778
779 if(distance <= cutoff){
780 map<string,long long>::iterator itA = nameAssignment.find(firstName);
781 map<string,long long>::iterator itB = nameAssignment.find(secondName);
782
783 if(itA == nameAssignment.end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the name or count file, please correct\n"); exit(1); }
784 if(itB == nameAssignment.end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the name or count file, please correct\n"); exit(1); }
785
786 long long indexA = (itA->second);
787 long long indexB = (itB->second);
788 singleton[indexA] = false;
789 singleton[indexB] = false;
790 singletonIndexSwap[indexA] = indexA;
791 singletonIndexSwap[indexB] = indexB;
792 }
793 }
794 fileHandle.close();
795
796 return singletonIndexSwap;
797 }
798 catch(exception& e) {
799 m->errorOut(e, "OptiRefMatrix", "readColumnSingletons");
800 exit(1);
801 }
802 }
803 /***********************************************************************/
804
readPhylipSingletons(vector<bool> & singleton,string distFile,long long & count,map<string,long long> & nameAssignment)805 map<long long, long long> OptiRefMatrix::readPhylipSingletons(vector<bool>& singleton, string distFile, long long& count, map<string, long long>& nameAssignment){
806 try {
807 float distance;
808 long long nseqs;
809 string name;
810 map<long long, long long> singletonIndexSwap;
811
812 ifstream fileHandle;
813 string numTest;
814
815 util.openInputFile(distFile, fileHandle);
816 fileHandle >> numTest >> name;
817 nameMap.push_back(name);
818 singletonIndexSwap[0] = 0;
819 nameAssignment[name] = 0;
820
821 if (!util.isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting.\n"); m->setControl_pressed(true); return singletonIndexSwap; }
822 else { convert(numTest, nseqs); }
823
824 //square test
825 char d;
826 while((d=fileHandle.get()) != EOF){
827 if(isalnum(d)){ square = true; fileHandle.putback(d); for(int i=0;i<nseqs;i++){ fileHandle >> distance; } break; }
828 if(d == '\n'){ square = false; break; }
829 }
830
831 singleton.resize((count+nseqs), true);
832 if(square == 0){
833
834 for(long long i=1;i<nseqs;i++){
835 if (m->getControl_pressed()) { break; }
836
837 fileHandle >> name; nameMap.push_back(name); singletonIndexSwap[i] = i; nameAssignment[name] = i;
838
839 for(long long j=0;j<i;j++){
840
841 fileHandle >> distance;
842
843 if (util.isEqual(distance,-1)) { distance = 1000000; }
844
845 if(distance <= cutoff){
846 singleton[i] = false;
847 singleton[j] = false;
848 }
849 }
850 }
851 }else{
852 for(long long i=1;i<nseqs;i++){
853 if (m->getControl_pressed()) { break; }
854
855 fileHandle >> name; nameMap.push_back(name); singletonIndexSwap[i] = i; nameAssignment[name] = i;
856
857 for(long long j=0;j<nseqs;j++){
858 fileHandle >> distance;
859
860 if (util.isEqual(distance,-1)) { distance = 1000000; }
861
862 if(distance <= cutoff && j < i){
863 singleton[i] = false;
864 singleton[j] = false;
865 }
866 }
867 }
868 }
869 fileHandle.close();
870
871 count += nseqs;
872
873 return singletonIndexSwap;
874 }
875 catch(exception& e) {
876 m->errorOut(e, "OptiRefMatrix", "readPhylipSingletons");
877 exit(1);
878 }
879 }
880 /***********************************************************************/
readPhylip(string distFile,bool hasName,map<string,string> & names,map<string,long long> & nameAssignment,map<long long,long long> & singletonIndexSwap)881 int OptiRefMatrix::readPhylip(string distFile, bool hasName, map<string, string>& names, map<string, long long>& nameAssignment, map<long long, long long>& singletonIndexSwap){
882 try {
883 long long nseqs;
884 string name;
885 double distance;
886
887 ifstream in; string numTest;
888 util.openInputFile(distFile, in);
889
890 in >> numTest >> name;
891
892 if (hasName) { name = names[name]; } //redundant names
893 nameMap[singletonIndexSwap[0]] = name;
894
895
896 if (!util.isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting.\n"); m->setControl_pressed(true); return 0; }
897 else { convert(numTest, nseqs); }
898
899 //square test
900 char d;
901 while((d=in.get()) != EOF){
902 if(isalnum(d)){ square = true; in.putback(d); for(int i=0;i<nseqs;i++){ in >> distance; } break; }
903 if(d == '\n'){ square = false; break; }
904 }
905
906 string line = "";
907 if(!square){
908
909 for(long long i=1;i<nseqs;i++){
910
911 if (m->getControl_pressed()) { break; }
912
913 in >> name; util.gobble(in);
914
915 if (hasName) { name = names[name]; } //redundant names
916 nameMap[singletonIndexSwap[i]] = name;
917
918 for(long long j=0;j<i;j++){
919
920 in >> distance; util.gobble(in);
921
922 if (util.isEqual(distance,-1)) { distance = 1000000; }
923
924 if(distance <= cutoff){
925 if (refWeightMethod == "connectivity") { //count dists
926 weights[i]++; weights[j]++;
927 }
928 long long newB = singletonIndexSwap[j];
929 long long newA = singletonIndexSwap[i];
930 closeness[newA].insert(newB);
931 closeness[newB].insert(newA);
932 }
933 }
934 }
935 }else{
936 for(long long i=0;i<nseqs;i++){ in >> distance; } util.gobble(in);
937
938 for(long long i=1;i<nseqs;i++){
939 if (m->getControl_pressed()) { break; }
940
941 in >> name; util.gobble(in);
942
943 if (hasName) { name = names[name]; } //redundant names
944 nameMap[singletonIndexSwap[i]] = name;
945
946 for(long long j=0;j<nseqs;j++){
947 in >> distance; util.gobble(in);
948
949 if (util.isEqual(distance,-1)) { distance = 1000000; }
950
951 if(distance <= cutoff && j < i){
952 if (refWeightMethod == "connectivity") { //count dists
953 weights[i]++; weights[j]++;
954 }
955 long long newB = singletonIndexSwap[j];
956 long long newA = singletonIndexSwap[i];
957 closeness[newA].insert(newB);
958 closeness[newB].insert(newA);
959 }
960 }
961 }
962 }
963 in.close();
964
965 return 0;
966 }
967 catch(exception& e) {
968 m->errorOut(e, "OptiRefMatrix", "readPhylip");
969 exit(1);
970 }
971 }
972 /***********************************************************************/
973
readColumn(string distFile,bool hasName,map<string,string> & names,map<string,long long> & nameAssignment,map<long long,long long> & singletonIndexSwap)974 int OptiRefMatrix::readColumn(string distFile, bool hasName, map<string, string>& names, map<string, long long>& nameAssignment, map<long long, long long>& singletonIndexSwap){
975 try {
976 string firstName, secondName;
977 double distance;
978
979 ifstream in; util.openInputFile(distFile, in);
980
981 while(in){ //let's assume it's a triangular matrix...
982
983 in >> firstName; util.gobble(in);
984 in >> secondName; util.gobble(in);
985 in >> distance; util.gobble(in); // get the row and column names and distance
986
987 if (m->getDebug()) { cout << firstName << '\t' << secondName << '\t' << distance << endl; }
988
989 if (m->getControl_pressed()) { in.close(); return 0; }
990
991 if (util.isEqual(distance,-1)) { distance = 1000000; }
992
993 if(distance <= cutoff){
994 map<string,long long>::iterator itA = nameAssignment.find(firstName);
995 map<string,long long>::iterator itB = nameAssignment.find(secondName);
996
997 if(itA == nameAssignment.end()){ m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the name or count file, please correct\n"); exit(1); }
998 if(itB == nameAssignment.end()){ m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the name or count file, please correct\n"); exit(1); }
999
1000 long long indexA = (itA->second);
1001 long long indexB = (itB->second);
1002
1003 if (refWeightMethod == "connectivity") { //count dists
1004 weights[indexA]++; weights[indexB]++;
1005 }
1006
1007 long long newB = singletonIndexSwap[indexB];
1008 long long newA = singletonIndexSwap[indexA];
1009 closeness[newA].insert(newB);
1010 closeness[newB].insert(newA);
1011
1012 if (hasName) {
1013 map<string, string>::iterator itName1 = names.find(firstName);
1014 map<string, string>::iterator itName2 = names.find(secondName);
1015
1016 if (itName1 != names.end()) { firstName = itName1->second; } //redundant names
1017 if (itName2 != names.end()) { secondName = itName2->second; } //redundant names
1018 }
1019
1020 nameMap[newA] = firstName;
1021 nameMap[newB] = secondName;
1022 }
1023 }
1024 in.close();
1025
1026 return 1;
1027
1028 }
1029 catch(exception& e) {
1030 m->errorOut(e, "OptiRefMatrix", "readColumn");
1031 exit(1);
1032 }
1033 }
1034 /***********************************************************************/
1035
calcCounts()1036 void OptiRefMatrix::calcCounts(){
1037 try {
1038 //find number of fitDists, refDists and between dists
1039 numRefDists = 0;
1040 numFitDists = 0;
1041 numBetweenDists = 0;
1042 numFitSingletons = 0;
1043 numFitSeqs = 0;
1044 numRefSingletons = 0;
1045
1046 for (long long i = 0; i < closeness.size(); i++) {
1047 if (m->getControl_pressed()) { break; }
1048
1049 bool thisSeqIsRef = isRef[i];
1050 long long thisSeqsNumRefDists = 0;
1051 long long thisSeqsNumFitDists = 0;
1052
1053 for (set<long long>::iterator it = closeness[i].begin(); it != closeness[i].end(); it++) {
1054 long long newB = *it;
1055
1056 if ((thisSeqIsRef) && (isRef[newB])) { thisSeqsNumRefDists++; } //both refs
1057 else if ((thisSeqIsRef) && (!isRef[newB])) { numBetweenDists++; } // ref to fit dist
1058 else if ((!thisSeqIsRef) && (isRef[newB])) { numBetweenDists++; } // fit to ref dist
1059 else if ((!thisSeqIsRef) && (!isRef[newB])) { thisSeqsNumFitDists++; } // both fit
1060 }
1061
1062 //a refSingleton or Fitsingleton may not be a true singleton (no valid dists in matrix), but may be a refSeq with no distances to other refs but distances to fitseqs. a fitsingleton may have dists to refs but no dists to other fitseqs.
1063
1064 //you are a ref with no refdists, so you are a refsingleton
1065 if ((thisSeqIsRef) && (thisSeqsNumRefDists == 0)) { numRefSingletons++; }
1066 else if ((!thisSeqIsRef) && (thisSeqsNumFitDists == 0)) { numFitSingletons++; }
1067 else if ((!thisSeqIsRef) && (thisSeqsNumFitDists != 0)) { numFitSeqs++; }
1068
1069 numRefDists += thisSeqsNumRefDists;
1070 numFitDists += thisSeqsNumFitDists;
1071 }
1072
1073 //counted twice
1074 numRefDists /= 2;
1075 numFitDists /= 2;
1076 numBetweenDists /= 2;
1077 }
1078 catch(exception& e) {
1079 m->errorOut(e, "OptiRefMatrix", "calcCounts");
1080 exit(1);
1081 }
1082 }
1083 /***********************************************************************/
1084
1085
1086