1 //
2 //  optirefmatrix.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 5/3/18.
6 //  Copyright © 2018 Schloss Lab. All rights reserved.
7 //
8 
9 #include "optirefmatrix.hpp"
10 #include "counttable.h"
11 
12 /***********************************************************************/
OptiRefMatrix(string distFile,string distFormat,string dupsFile,string dupsFormat,double c,float fP,string refWeight)13 OptiRefMatrix::OptiRefMatrix(string distFile, string distFormat, string dupsFile, string dupsFormat, double c, float fP, string refWeight) : OptiData(c) {
14 
15     numFitSingletons = 0;
16     numRefSingletons = 0;
17     numSingletons = 0;
18     numBetweenDists = 0;
19     numFitDists = 0;
20     numRefDists = 0;
21     numFitSeqs = 0;
22     refWeightMethod = refWeight;
23 
24     fitPercent = fP / 100.0;
25     if (fitPercent < 0.001) { fitPercent = 0.10; m->mothurOut("[WARNING]: fit percentage must be between 0.001 (0.1%) and 1.0 (100%). Setting to 0.10 or 10%. \n"); } //minumum of 0.1%
26     else if (fitPercent > 100.0) {  m->mothurOut("[ERROR]: fit percentage must be between 0.0001 and 100.0\n"); m->setControl_pressed(true); }
27 
28     square = false;
29 
30     set<string> noRefNamesSet;
31     readFiles(distFile, distFormat, dupsFile, dupsFormat, noRefNamesSet);
32 }
33 /***********************************************************************/
OptiRefMatrix(string distFile,string distFormat,string dupsFile,string dupsFormat,double c,set<string> accnosRefFileNames)34 OptiRefMatrix::OptiRefMatrix(string distFile, string distFormat, string dupsFile, string dupsFormat, double c, set<string> accnosRefFileNames) : OptiData(c) {
35 
36     numFitSingletons = 0;
37     numRefSingletons = 0;
38     numSingletons = 0;
39     numBetweenDists = 0;
40     numFitDists = 0;
41     numRefDists = 0;
42     numFitSeqs = 0;
43     refWeightMethod = "accnos";
44 
45     square = false;
46 
47     readFiles(distFile, distFormat, dupsFile, dupsFormat, accnosRefFileNames);
48 }
49 
50 /***********************************************************************/
OptiRefMatrix(string d,string nc,string f,string df,double c,string fit,string fitnc,string fitf,string fitdf,string betweend,string betweendf)51 OptiRefMatrix::OptiRefMatrix(string d, string nc, string f, string df, double c, string fit, string fitnc, string fitf, string fitdf, string betweend, string betweendf) : OptiData(c) {
52 
53     string refdistfile, refnamefile, refcountfile, refformat, refdistformat, fitdistfile, fitnamefile, fitcountfile, fitformat, fitdistformat, betweendistfile, betweendistformat;
54 
55     refdistfile = d; refdistformat = df; refformat = f; fitdistfile = fit; fitdistformat = fitdf; fitformat = fitf; betweendistfile = betweend; betweendistformat = betweendf;
56 
57     numFitSingletons = 0;
58     numRefSingletons = 0;
59     numSingletons = 0;
60     numBetweenDists = 0;
61     numFitDists = 0;
62     numRefDists = 0;
63     numFitSeqs = 0;
64 
65     fitPercent = 0;
66     refWeightMethod = "none";
67 
68     square = false;
69 
70     if (refformat == "name") { refnamefile = nc; refcountfile = ""; }
71     else if (refformat == "count") { refcountfile = nc; refnamefile = ""; }
72     else { refcountfile = ""; refnamefile = ""; }
73 
74     if (fitformat == "name") { fitnamefile = fitnc; fitcountfile = ""; }
75     else if (fitformat == "count") { fitcountfile = fitnc; fitnamefile = ""; }
76     else { fitcountfile = ""; fitnamefile = ""; }
77 
78     readFiles(refdistfile, refnamefile, refcountfile, refformat, refdistformat, fitdistfile, fitnamefile, fitcountfile, fitformat, fitdistformat, betweendistfile, betweendistformat);
79 }
80 /***********************************************************************/
81 //Since we are extracting a subset of the seqs some reads that may not have been singletons
extractRefMatrix()82 OptiData* OptiRefMatrix::extractRefMatrix() {
83     try {
84         set<long long> seqs; for (long long i = 0; i < isRef.size(); i++) { if (isRef[i]) { seqs.insert(i); } }
85 
86         vector<string> subsetNameMap;
87         vector<string> subsetSingletons;
88         vector< set<long long> > subsetCloseness;
89         map<long long, long long> thisNameMap;
90         map<long long, long long> nonSingletonNameMap;
91         vector<bool> singleton; singleton.resize(seqs.size(), true);
92         int count = 0;
93 
94         for (set<long long>::iterator it = seqs.begin(); it != seqs.end(); it++) {
95             long long seqNum = *it;
96             thisNameMap[seqNum] = count;
97             nonSingletonNameMap[count] = seqNum;
98 
99             set<long long> thisSeqsCloseSeqs = getCloseSeqs(seqNum);
100             for (set<long long>::iterator itClose = thisSeqsCloseSeqs.begin(); itClose != thisSeqsCloseSeqs.end(); itClose++) {
101 
102                 if (m->getControl_pressed()) { break; }
103 
104                 long long thisSeq = *itClose;
105 
106                 //is this seq in the set of unfitted?
107                 if (seqs.count(thisSeq) != 0) { singleton[thisNameMap[seqNum]] = false; }
108             }
109             count++;
110         }
111 
112         int nonSingletonCount = 0;
113         for (long long i = 0; i < singleton.size(); i++) {
114             if (!singleton[i]) { //if you are not a singleton
115                 nonSingletonNameMap[i] = nonSingletonCount;
116                 nonSingletonCount++;
117             }else { seqs.erase(nonSingletonNameMap[i]);  subsetSingletons.push_back(getName(nonSingletonNameMap[i])); } //remove from unfitted
118         }
119         singleton.clear();
120 
121         subsetCloseness.resize(nonSingletonCount);
122         for (set<long long>::iterator it = seqs.begin(); it != seqs.end(); it++) {
123 
124             if (m->getControl_pressed()) { break; }
125 
126             long long seqNum = *it;
127 
128             set<long long> thisSeqsCloseSeqs = getCloseSeqs(seqNum);
129             set<long long> thisSeqsCloseUnFittedSeqs;
130             for (set<long long>::iterator itClose = thisSeqsCloseSeqs.begin(); itClose != thisSeqsCloseSeqs.end(); itClose++) {
131 
132                 if (m->getControl_pressed()) { break; }
133 
134                 long long thisSeq = *itClose;
135 
136                 //is this seq in the set of unfitted?
137                 if (seqs.count(thisSeq) != 0) { thisSeqsCloseUnFittedSeqs.insert(nonSingletonNameMap[thisNameMap[thisSeq]]); }
138             }
139 
140             if (!thisSeqsCloseUnFittedSeqs.empty()) {
141                 subsetCloseness[nonSingletonNameMap[thisNameMap[seqNum]]] = thisSeqsCloseUnFittedSeqs;
142                 subsetNameMap.push_back(getName(seqNum));
143             }
144 
145         }
146 
147         for (int i = 0; i < isSingleRef.size(); i++) { if (isSingleRef[i]) { subsetSingletons.push_back(singletons[i]); } }
148 
149         OptiData* unfittedMatrix = new OptiMatrix(subsetCloseness, subsetNameMap, subsetSingletons, cutoff);
150 
151         return unfittedMatrix;
152     }
153     catch(exception& e) {
154         m->errorOut(e, "OptiRefMatrix", "extractRefMatrix");
155         exit(1);
156     }
157 }
158 /***********************************************************************/
159 //given set of names, pull out their dists and create optimatrix
extractMatrixSubset(set<string> & seqs)160 OptiData* OptiRefMatrix::extractMatrixSubset(set<string>& seqs) {
161     try {
162 
163         set<long long> seqIndexes = getIndexes(seqs);
164         return extractMatrixSubset(seqIndexes);
165     }
166     catch(exception& e) {
167         m->errorOut(e, "OptiRefMatrix", "extractMatrixSubset");
168         exit(1);
169     }
170 }
171 /***********************************************************************/
172 //given matrix indexes of seqs, pull out their dists and create optimatrix
extractMatrixSubset(set<long long> & seqs)173 OptiData* OptiRefMatrix::extractMatrixSubset(set<long long> & seqs) {
174     try {
175         vector<string> subsetNameMap;
176         vector<string> subsetSingletons;
177         vector< set<long long> > subsetCloseness;
178         map<long long, long long> thisNameMap;
179         map<long long, long long> nonSingletonNameMap;
180         vector<bool> singleton; singleton.resize(seqs.size(), true);
181         int count = 0;
182 
183         for (set<long long>::iterator it = seqs.begin(); it != seqs.end(); it++) {
184 
185             long long seqNum = *it;
186             thisNameMap[seqNum] = count;
187             nonSingletonNameMap[count] = seqNum;
188 
189             set<long long> thisSeqsCloseSeqs = getCloseSeqs(seqNum);
190             for (set<long long>::iterator itClose = thisSeqsCloseSeqs.begin(); itClose != thisSeqsCloseSeqs.end(); itClose++) {
191 
192                 if (m->getControl_pressed()) { break; }
193 
194                 long long thisSeq = *itClose;
195 
196                 //is this seq in the set of unfitted?
197                 if (seqs.count(thisSeq) != 0) { singleton[thisNameMap[seqNum]] = false; }
198             }
199             count++;
200         }
201 
202         int nonSingletonCount = 0;
203         for (long long i = 0; i < singleton.size(); i++) {
204             if (!singleton[i]) { //if you are a singleton
205                 nonSingletonNameMap[i] = nonSingletonCount;
206                 nonSingletonCount++;
207             }else { seqs.erase(nonSingletonNameMap[i]);  subsetSingletons.push_back(getName(nonSingletonNameMap[i])); } //remove from unfitted
208         }
209         singleton.clear();
210 
211         subsetCloseness.resize(nonSingletonCount);
212         for (set<long long>::iterator it = seqs.begin(); it != seqs.end(); it++) {
213 
214             if (m->getControl_pressed()) { break; }
215 
216             long long seqNum = *it;
217 
218             set<long long> thisSeqsCloseSeqs = getCloseSeqs(seqNum);
219             set<long long> thisSeqsCloseUnFittedSeqs;
220             for (set<long long>::iterator itClose = thisSeqsCloseSeqs.begin(); itClose != thisSeqsCloseSeqs.end(); itClose++) {
221 
222                 if (m->getControl_pressed()) { break; }
223 
224                 long long thisSeq = *itClose;
225 
226                 //is this seq in the set of unfitted?
227                 if (seqs.count(thisSeq) != 0) { thisSeqsCloseUnFittedSeqs.insert(nonSingletonNameMap[thisNameMap[thisSeq]]); }
228             }
229 
230             if (!thisSeqsCloseUnFittedSeqs.empty()) {
231                 subsetCloseness[nonSingletonNameMap[thisNameMap[seqNum]]] = thisSeqsCloseUnFittedSeqs;
232                 subsetNameMap.push_back(getName(seqNum));
233             }
234 
235         }
236 
237         OptiData* unfittedMatrix = new OptiMatrix(subsetCloseness, subsetNameMap, subsetSingletons, cutoff);
238 
239         return unfittedMatrix;
240     }
241     catch(exception& e) {
242         m->errorOut(e, "OptiRefMatrix", "extractMatrixSubset");
243         exit(1);
244     }
245 }
246 /***********************************************************************/
getTranslatedBins(vector<vector<string>> & binNames,vector<vector<long long>> & fixedBins)247 vector<long long> OptiRefMatrix::getTranslatedBins(vector<vector<string> > & binNames, vector<vector<long long> > & fixedBins) {
248     try {
249         fixedBins.clear();
250 
251         map<string, long long> nameIndexes;
252         set<string> unique;
253         for (long long i = 0; i < nameMap.size(); i++) { //vector of string representing the sequences in the matrix from the name file.
254             vector<string> thisSeqsReps; util.splitAtComma(nameMap[i], thisSeqsReps); //split redundant names
255             if (i < closeness.size()) {  nameIndexes[thisSeqsReps[0]] = i;  } //this is a sequence with distances in the matrix
256             if (thisSeqsReps.size() == 1) { //you are unique
257                 unique.insert(thisSeqsReps[0]);
258             }
259         }
260 
261         for (long long i = 0; i < singletons.size(); i++) {
262             if (isSingleRef[i]) {
263                 vector<string> thisSeqsReps; util.splitAtComma(singletons[i], thisSeqsReps); //split redundant names
264                 nameIndexes[thisSeqsReps[0]] = -1;
265                 if (thisSeqsReps.size() == 1) { unique.insert(thisSeqsReps[0]); }
266             }
267         }
268 
269         for (long long i = 0; i < binNames.size(); i++) { //for each OTU
270             vector<long long> thisBinsSeqs;
271             for (long long j = 0; j < binNames[i].size(); j++) { //for each sequence
272                 map<string, long long>::iterator it = nameIndexes.find(binNames[i][j]);
273 
274                 if (it == nameIndexes.end()) { }//not in distance matrix, but needs a value in fixedBins. 2 reasons for making it here: you are a redundant name in the listfile, you do not have any distances
275                 else { thisBinsSeqs.push_back(it->second);  } //"name" of sequence in matrix
276             }
277             fixedBins.push_back(thisBinsSeqs);
278         }
279 
280         return (getFitSeqs());
281     }
282     catch(exception& e) {
283         m->errorOut(e, "OptiRefMatrix", "getTranslatedBins");
284         exit(1);
285     }
286 }
287 /***********************************************************************/
288 //assumes that i is a fitSeq
isCloseFit(long long i,long long toFind,bool & isFit)289 bool OptiRefMatrix::isCloseFit(long long i, long long toFind, bool& isFit){
290     try {
291         if (i < 0) { return false; }
292         else if (i > closeness.size()) { m->mothurOut("[ERROR]: index is not valid.\n"); m->setControl_pressed(true); return false; }
293 
294         bool found = false;
295         if (!isRef[toFind]) { //are you a fit seq
296             if (closeness[i].count(toFind) != 0) {  //are you close
297                 found = true;
298             }
299             isFit = true;
300         }else { isFit = false;  }
301         return found;
302     }
303     catch(exception& e) {
304         m->errorOut(e, "OptiRefMatrix", "isCloseFit");
305         exit(1);
306     }
307 }
308 /***********************************************************************/
309 //does not include singletons, only reads in closeness
getRefSeqs()310 vector<long long> OptiRefMatrix::getRefSeqs() {
311     try {
312         vector<long long> refSeqsIndexes;
313         for (long long i = 0; i < isRef.size(); i++) {
314             if (isRef[i]) { refSeqsIndexes.push_back(i); }
315         }
316         return refSeqsIndexes;
317     }
318     catch(exception& e) {
319         m->errorOut(e, "OptiRefMatrix", "getRefSeqs");
320         exit(1);
321     }
322 }
323 /***********************************************************************/
getRefSingletonNames()324 vector<string> OptiRefMatrix::getRefSingletonNames() {
325     try {
326         vector<string> refSeqsNames;
327 
328         for (long long i = 0; i < isSingleRef.size(); i++) {
329             if (isSingleRef[i]) { refSeqsNames.push_back(singletons[i]); }
330         }
331 
332         return refSeqsNames;
333     }
334     catch(exception& e) {
335         m->errorOut(e, "OptiRefMatrix", "getRefSingletonNames");
336         exit(1);
337     }
338 }
339 /***********************************************************************/
getFitSeqs()340 vector<long long> OptiRefMatrix::getFitSeqs() {
341     try {
342         vector<long long> fitSeqsIndexes;
343         for (long long i = 0; i < isRef.size(); i++) {
344             if (!isRef[i]) { fitSeqsIndexes.push_back(i);  }
345         }
346         return fitSeqsIndexes;
347     }
348     catch(exception& e) {
349         m->errorOut(e, "OptiRefMatrix", "getFitSeqs");
350         exit(1);
351     }
352 }
353 /***********************************************************************/
getNumFitTrueSingletons()354 long long OptiRefMatrix::getNumFitTrueSingletons() {
355     try {
356         long long numFitTrueSingletons = 0;
357 
358         for (long long i = 0; i < isSingleRef.size(); i++) {
359             if (!isSingleRef[i]) { numFitTrueSingletons++; }
360         }
361 
362         return numFitTrueSingletons;
363     }
364     catch(exception& e) {
365         m->errorOut(e, "OptiData", "getNumFitTrueSingletons");
366         exit(1);
367     }
368 }
369 /***********************************************************************/
getNumFitClose(long long index)370 long long OptiRefMatrix::getNumFitClose(long long index) {
371     try {
372         long long numClose = 0;
373 
374         if (index < 0) { }
375         else if (index > closeness.size()) { m->mothurOut("[ERROR]: index is not valid.\n"); m->setControl_pressed(true);  }
376         else {
377             //reference seqs all have indexes less than refEnd
378             for (set<long long>::iterator it = closeness[index].begin(); it != closeness[index].end(); it++) {
379                 if (!isRef[*it]) {  numClose++; } //you are a fit seq
380             }
381         }
382 
383         return numClose;
384     }
385     catch(exception& e) {
386         m->errorOut(e, "OptiData", "getNumClose");
387         exit(1);
388     }
389 }
390 /***********************************************************************/
getNumRefClose(long long index)391 long long OptiRefMatrix::getNumRefClose(long long index) {
392     try {
393         long long numClose = 0;
394 
395         if (index < 0) { }
396         else if (index > closeness.size()) { m->mothurOut("[ERROR]: index is not valid.\n"); m->setControl_pressed(true);  }
397         else {
398             //reference seqs all have indexes less than refEnd
399             for (set<long long>::iterator it = closeness[index].begin(); it != closeness[index].end(); it++) {
400                 if (isRef[*it]) {  numClose++; } //you are a ref seq
401             }
402         }
403 
404         return numClose;
405     }
406     catch(exception& e) {
407         m->errorOut(e, "OptiData", "getNumClose");
408         exit(1);
409     }
410 }
411 /***********************************************************************/
getCloseFitSeqs(long long index)412 set<long long> OptiRefMatrix::getCloseFitSeqs(long long index){
413     try {
414         set<long long> closeSeqs;
415 
416         if (index < 0) { }
417         else if (index > closeness.size()) { m->mothurOut("[ERROR]: index is not valid.\n"); m->setControl_pressed(true);  } //
418         else {
419             //reference seqs all have indexes less than refEnd
420             for (set<long long>::iterator it = closeness[index].begin(); it != closeness[index].end(); it++) {
421                 if (!isRef[*it]) {  closeSeqs.insert(*it); } //you are a fit seq
422             }
423         }
424 
425         return closeSeqs;
426     }
427     catch(exception& e) {
428         m->errorOut(e, "OptiRefMatrix", "getCloseFitSeqs");
429         exit(1);
430     }
431 }
432 /***********************************************************************/
getCloseRefSeqs(long long index)433 set<long long> OptiRefMatrix::getCloseRefSeqs(long long index){
434     try {
435         set<long long> closeSeqs;
436 
437         if (index < 0) { }
438         else if (index > closeness.size()) { m->mothurOut("[ERROR]: index is not valid.\n"); m->setControl_pressed(true);  }
439         else {
440             //reference seqs all have indexes less than refEnd
441             for (set<long long>::iterator it = closeness[index].begin(); it != closeness[index].end(); it++) {
442                 if (isRef[*it]) { closeSeqs.insert(*it); } //you are a ref seq
443             }
444         }
445 
446         return closeSeqs;
447     }
448     catch(exception& e) {
449         m->errorOut(e, "OptiRefMatrix", "getCloseFitSeqs");
450         exit(1);
451     }
452 }
453 /***********************************************************************/
454 //only used in open reference clustering
getFitListSingle()455 ListVector* OptiRefMatrix::getFitListSingle() {
456     try {
457         ListVector* singlelist = NULL;
458 
459         if (singletons.size() == 0) { }
460         else {
461             singlelist = new ListVector();
462 
463             for (int i = 0; i < isSingleRef.size(); i++) {
464                 if (!isSingleRef[i]) {
465                     singlelist->push_back(singletons[i]); }
466             }
467         }
468 
469         return singlelist;
470     }
471     catch(exception& e) {
472         m->errorOut(e, "OptiRefMatrix", "getFitListSingle");
473         exit(1);
474     }
475 }
476 /***********************************************************************/
randomizeRefs()477 void OptiRefMatrix::randomizeRefs() {
478     try {
479         long long totalSeqs = (isRef.size()+isSingleRef.size());
480         long long numToSelect = totalSeqs * fitPercent;
481         long long refSingletonCutoff = isRef.size();
482         long long singleSize = isSingleRef.size();
483 
484         //select sequences to be reference
485         set<long long> fitSeqsIndexes;
486         if (weights.size() != 0) {  fitSeqsIndexes = subsample.getWeightedSample(weights, numToSelect);  } //you have weighted selection
487         else {
488             long long numSelected = 0;
489             while (numSelected < numToSelect) {
490                 if (m->getControl_pressed()) { break; }
491                 fitSeqsIndexes.insert(util.getRandomIndex(totalSeqs-1)); //no repeats
492                 numSelected = fitSeqsIndexes.size();
493             }
494         }
495 
496         //initilize isRef to true
497         isRef.clear(); isRef.resize(refSingletonCutoff, true);
498         isSingleRef.clear(); isSingleRef.resize(singleSize, true);
499 
500         //set isRef values
501         for (set<long long>::iterator it = fitSeqsIndexes.begin(); it != fitSeqsIndexes.end(); it++) {
502             if (m->getControl_pressed()) { break; }
503 
504             long long thisSeq = *it;
505             if (thisSeq < refSingletonCutoff) { //you are a non singleton seq in the closeness
506                 isRef[thisSeq] = false;
507             }else { //thisSeq is a singleton
508                 isSingleRef[thisSeq-refSingletonCutoff] = false;
509             }
510         }
511 
512         //find number of fitDists, refDists and between dists
513         calcCounts();
514     }
515     catch(exception& e) {
516         m->errorOut(e, "OptiRefMatrix", "randomizeRefs");
517         exit(1);
518     }
519 
520 }
521 /***********************************************************************/
522 //for denovo method
readFiles(string distFile,string distFormat,string dupsFile,string dupsFormat,set<string> & optionalRefNames)523 int OptiRefMatrix::readFiles(string distFile, string distFormat, string dupsFile, string dupsFormat, set<string>& optionalRefNames) {
524     try {
525         string namefile, countfile;
526         if (dupsFormat == "name") { namefile = dupsFile; countfile = ""; }
527         else if (dupsFormat == "count") { countfile = dupsFile; namefile = ""; }
528         else { countfile = ""; namefile = ""; }
529 
530         map<string, long long> nameAssignment;
531         if (namefile != "") { util.readNames(namefile, nameAssignment); }
532         else  {
533             CountTable ct; ct.readTable(countfile, false, true);
534             map<string, int> temp = ct.getNameMap();
535             for (map<string, int>::iterator it = temp.begin(); it!= temp.end(); it++) {  nameAssignment[it->first] = it->second; }
536         }
537 
538         //select sequences to be reference
539         set<long long> fitSeqsIndexes;
540         long long count = 0;
541         for (map<string, long long>::iterator it = nameAssignment.begin(); it!= nameAssignment.end(); it++) {
542             if (refWeightMethod == "abundance")          { weights[count] = it->second; }
543             else if (refWeightMethod == "connectivity")  { weights[count] = 1;          } //initialize
544             else if (refWeightMethod == "accnos") { //fill fit indexes
545                 if (optionalRefNames.count(it->first) == 0) { //you are not a reference sequence
546                     fitSeqsIndexes.insert(count); //add as fit seq
547                 }
548             }
549             it->second = count; count++;
550             nameMap.push_back(it->first);
551             nameAssignment[it->first] = it->second;
552         }
553 
554         //read file to find singletons
555         vector<bool> singleton; singleton.resize(count, true);
556         map<long long, long long> singletonIndexSwap;
557 
558         if (distFormat == "column")        {  singletonIndexSwap = readColumnSingletons(singleton, distFile, nameAssignment);           }
559         else if (distFormat == "phylip")   {  singletonIndexSwap = readPhylipSingletons(singleton, distFile, count, nameAssignment);    }
560 
561         int nonSingletonCount = 0;
562         for (int i = 0; i < singleton.size(); i++) {
563             if (!singleton[i]) { //if you are not a singleton
564                 singletonIndexSwap[i] = nonSingletonCount;
565                 nonSingletonCount++;
566             }else { singletons.push_back(nameMap[i]); }
567         }
568         numSingletons = singletons.size();
569         closeness.resize(nonSingletonCount);
570 
571         map<string, string> names;
572         if (namefile != "") {
573             //update names for reference
574             util.readNames(namefile, names);
575             for (int i = 0; i < numSingletons; i++) {
576                 singletons[i] = names[singletons[i]];
577             }
578         }
579 
580         //read reference file distances
581         bool hasName = false;
582         if (namefile != "") { hasName = true; }
583         if (distFormat == "column")        {  readColumn(distFile, hasName, names, nameAssignment, singletonIndexSwap);     }
584         else if (distFormat == "phylip")   {  readPhylip(distFile, hasName, names, nameAssignment, singletonIndexSwap);     }
585 
586 
587         //randomly select the "fit" seqs
588         long long numToSelect = nameAssignment.size() * fitPercent;
589         if (weights.size() != 0) {  fitSeqsIndexes = subsample.getWeightedSample(weights, numToSelect);  } //you have weighted selection
590         else {
591             if (refWeightMethod == "accnos") { } //fitIndexes are filled above
592             else { //randomly select references
593                 long long numSelected = 0;
594                 long long totalSeqs = nameAssignment.size();
595                 while (numSelected < numToSelect) {
596                     if (m->getControl_pressed()) { break; }
597                     fitSeqsIndexes.insert(util.getRandomIndex(totalSeqs-1)); //no repeats
598                     numSelected = fitSeqsIndexes.size();
599                 }
600             }
601         }
602 
603         //flag reference seqs singleton or not
604         for (int i = 0; i < singleton.size(); i++) {
605             if (!singleton[i]) { //if you are not a singleton
606 
607                 if (fitSeqsIndexes.count(i) != 0) { //you are a fit seq
608                     isRef.push_back(false);
609                 }else { isRef.push_back(true);  } //its a reference
610             }else {
611                 if (fitSeqsIndexes.count(i) != 0) { //you are a fit seq singleton
612                     isSingleRef.push_back(false);
613                 }else { isSingleRef.push_back(true); } //its a singleton reference
614             }
615         }
616         singleton.clear();
617 
618         //find number of fitDists, refDists and between dists
619         calcCounts();
620 
621         return 0;
622     }
623     catch(exception& e) {
624         m->errorOut(e, "OptiRefMatrix", "readFiles");
625         exit(1);
626     }
627 }
628 /***********************************************************************/
629 //for reading reference and fit files separately, reference method
readFiles(string refdistfile,string refnamefile,string refcountfile,string refformat,string refdistformat,string fitdistfile,string fitnamefile,string fitcountfile,string fitformat,string fitdistformat,string betweendistfile,string betweendistformat)630 int OptiRefMatrix::readFiles(string refdistfile, string refnamefile, string refcountfile, string refformat, string refdistformat, string fitdistfile, string fitnamefile, string fitcountfile, string fitformat, string fitdistformat, string betweendistfile, string betweendistformat){
631     try {
632         map<string, long long> nameAssignment;
633         if (refnamefile != "") { util.readNames(refnamefile, nameAssignment); }
634         else  {
635             CountTable ct; ct.readTable(refcountfile, false, true);
636             map<string, int> temp = ct.getNameMap();
637             for (map<string, int>::iterator it = temp.begin(); it!= temp.end(); it++) {  nameAssignment[it->first] = it->second; }
638         }
639 
640         long long count = 0;
641         for (map<string, long long>::iterator it = nameAssignment.begin(); it!= nameAssignment.end(); it++) {
642             it->second = count; count++;
643             nameMap.push_back(it->first);
644             nameAssignment[it->first] = it->second;
645         }
646 
647         long long refCount = count;
648         vector<bool> singleton; singleton.resize(count, true); //resize will only set new elements to true
649         map<long long, long long> refSingletonIndexSwap; //index into
650         if (refdistformat == "column")        {  refSingletonIndexSwap = readColumnSingletons(singleton, refdistfile, nameAssignment);          }
651         else if (refdistformat == "phylip")   {  refSingletonIndexSwap = readPhylipSingletons(singleton, refdistfile, count, nameAssignment);   }
652 
653         //read fit file to find singletons
654         map<long long, long long> fitSingletonIndexSwap;
655         map<string, long long> fitnameAssignment;
656         if (fitnamefile != "") { util.readNames(fitnamefile, fitnameAssignment); }
657         else  {
658             CountTable ct; ct.readTable(fitcountfile, false, true);
659             map<string, int> temp = ct.getNameMap();
660             for (map<string, int>::iterator it = temp.begin(); it!= temp.end(); it++) {  fitnameAssignment[it->first] = it->second; }
661         }
662 
663         for (map<string, long long>::iterator it = fitnameAssignment.begin(); it!= fitnameAssignment.end(); it++) {
664             it->second = count; count++;
665             nameMap.push_back(it->first);
666             nameAssignment[it->first] = it->second;
667         }
668 
669         singleton.resize(count, true);
670         if (fitdistformat == "column")        {  fitSingletonIndexSwap = readColumnSingletons(singleton, fitdistfile, nameAssignment);          }
671         else if (fitdistformat == "phylip")   {  fitSingletonIndexSwap = readPhylipSingletons(singleton, fitdistfile, count, nameAssignment);   }
672 
673         fitPercent = ((count-refCount) / (float) count);
674 
675         //read bewtween file to update singletons
676         readColumnSingletons(singleton, betweendistfile, nameAssignment);
677 
678         long long nonSingletonCount = 0;
679         map<long long, long long> singletonIndexSwap;
680         for (long long i = 0; i < refCount; i++) {
681             if (!singleton[i]) { //if you are not a singleton
682                 singletonIndexSwap[i] = nonSingletonCount;
683                 isRef.push_back(true);
684                 nonSingletonCount++;
685             }else {
686                 singletons.push_back(nameMap[i]);
687                 isSingleRef.push_back(true);
688             }
689         }
690         refSingletonIndexSwap.clear();
691 
692         for (long long i = refCount; i < singleton.size(); i++) {
693             if (!singleton[i]) { //if you are not a singleton
694                 singletonIndexSwap[i] = nonSingletonCount;
695                 isRef.push_back(false);
696                 nonSingletonCount++;
697             }else {
698                 singletons.push_back(nameMap[i]);
699                 isSingleRef.push_back(false);
700             }
701         }
702         singleton.clear();
703         fitSingletonIndexSwap.clear();
704 
705         numSingletons = singletons.size();
706         closeness.resize(nonSingletonCount);
707 
708         map<string, string> names;
709         if (refnamefile != "") { util.readNames(refnamefile, names); }
710 
711         if (fitnamefile != "") {
712             map<string, string> fitnames;
713             util.readNames(fitnamefile, fitnames);
714 
715             names.insert(fitnames.begin(), fitnames.end()); //copy fit names into names
716         }
717 
718         if ((fitnamefile != "") || (refnamefile != "")) {
719             for (int i = 0; i < singletons.size(); i++) {
720                 map<string, string>::iterator it = names.find(singletons[i]);
721                 if (it != names.end()) { //update singletons
722                     singletons[i] = it->second;
723                 }
724             }
725         }
726 
727         //read reference file distances
728         bool refHasName = false;
729         if (refnamefile != "") { refHasName = true; }
730         if (refdistformat == "column")        {  readColumn(refdistfile, refHasName, names, nameAssignment, singletonIndexSwap);     }
731         else if (refdistformat == "phylip")   {  readPhylip(refdistfile, refHasName, names, nameAssignment, singletonIndexSwap);     }
732 
733 
734         //read fit distances
735         bool fitHasName = false;
736         if (fitnamefile != "") { fitHasName = true; }
737         if (fitdistformat == "column")        {  readColumn(fitdistfile, fitHasName, names, nameAssignment, singletonIndexSwap);     }
738         else if (fitdistformat == "phylip")   {  readPhylip(fitdistfile, fitHasName, names, nameAssignment, singletonIndexSwap);     }
739 
740 
741         //read in between distances
742         bool hasName = fitHasName;
743         if (!hasName && refHasName) { hasName = true; } //if either the ref or fit has a name file then set hasName
744         if (betweendistformat == "column")        {  readColumn(betweendistfile, hasName, names, nameAssignment, singletonIndexSwap);     }
745         else if (betweendistformat == "phylip")   {  readPhylip(betweendistfile, hasName, names, nameAssignment, singletonIndexSwap);     }
746 
747         //find number of fitDists, refDists and between dists
748         calcCounts();
749 
750         return 0;
751     }
752     catch(exception& e) {
753         m->errorOut(e, "OptiRefMatrix", "readFiles");
754         exit(1);
755     }
756 }
757 /***********************************************************************/
readColumnSingletons(vector<bool> & singleton,string distFile,map<string,long long> & nameAssignment)758 map<long long, long long> OptiRefMatrix::readColumnSingletons(vector<bool>& singleton, string distFile, map<string, long long>& nameAssignment){
759     try {
760 
761         ifstream fileHandle; util.openInputFile(distFile, fileHandle);
762 
763         string firstName, secondName;
764         double distance;
765         map<long long, long long> singletonIndexSwap;
766 
767         while(fileHandle){  //let's assume it's a triangular matrix...
768 
769             fileHandle >> firstName; util.gobble(fileHandle);
770             fileHandle >> secondName; util.gobble(fileHandle);
771             fileHandle >> distance;	util.gobble(fileHandle); // get the row and column names and distance
772 
773             if (m->getDebug()) { cout << firstName << '\t' << secondName << '\t' << distance << endl; }
774 
775             if (m->getControl_pressed()) {  break; }
776 
777             if (util.isEqual(distance,-1)) { distance = 1000000; }
778 
779             if(distance <= cutoff){
780                 map<string,long long>::iterator itA = nameAssignment.find(firstName);
781                 map<string,long long>::iterator itB = nameAssignment.find(secondName);
782 
783                 if(itA == nameAssignment.end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the name or count file, please correct\n"); exit(1);  }
784                 if(itB == nameAssignment.end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the name or count file, please correct\n"); exit(1);  }
785 
786                 long long indexA = (itA->second);
787                 long long indexB = (itB->second);
788                 singleton[indexA] = false;
789                 singleton[indexB] = false;
790                 singletonIndexSwap[indexA] = indexA;
791                 singletonIndexSwap[indexB] = indexB;
792             }
793         }
794         fileHandle.close();
795 
796         return singletonIndexSwap;
797     }
798     catch(exception& e) {
799         m->errorOut(e, "OptiRefMatrix", "readColumnSingletons");
800         exit(1);
801     }
802 }
803 /***********************************************************************/
804 
readPhylipSingletons(vector<bool> & singleton,string distFile,long long & count,map<string,long long> & nameAssignment)805 map<long long, long long> OptiRefMatrix::readPhylipSingletons(vector<bool>& singleton, string distFile, long long& count, map<string, long long>& nameAssignment){
806     try {
807         float distance;
808         long long nseqs;
809         string name;
810         map<long long, long long> singletonIndexSwap;
811 
812         ifstream fileHandle;
813         string numTest;
814 
815         util.openInputFile(distFile, fileHandle);
816         fileHandle >> numTest >> name;
817         nameMap.push_back(name);
818         singletonIndexSwap[0] = 0;
819         nameAssignment[name] = 0;
820 
821         if (!util.isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting.\n"); m->setControl_pressed(true); return singletonIndexSwap; }
822         else { convert(numTest, nseqs); }
823 
824         //square test
825         char d;
826         while((d=fileHandle.get()) != EOF){
827             if(isalnum(d)){ square = true; fileHandle.putback(d); for(int i=0;i<nseqs;i++){ fileHandle >> distance;  } break; }
828             if(d == '\n'){ square = false; break; }
829         }
830 
831         singleton.resize((count+nseqs), true);
832         if(square == 0){
833 
834             for(long long i=1;i<nseqs;i++){
835                 if (m->getControl_pressed()) {  break; }
836 
837                 fileHandle >> name; nameMap.push_back(name); singletonIndexSwap[i] = i;  nameAssignment[name] = i;
838 
839                 for(long long j=0;j<i;j++){
840 
841                     fileHandle >> distance;
842 
843                     if (util.isEqual(distance,-1)) { distance = 1000000; }
844 
845                     if(distance <= cutoff){
846                         singleton[i] = false;
847                         singleton[j] = false;
848                     }
849                 }
850             }
851         }else{
852             for(long long i=1;i<nseqs;i++){
853                 if (m->getControl_pressed()) {  break; }
854 
855                 fileHandle >> name; nameMap.push_back(name); singletonIndexSwap[i] = i; nameAssignment[name] = i;
856 
857                 for(long long j=0;j<nseqs;j++){
858                     fileHandle >> distance;
859 
860                     if (util.isEqual(distance,-1)) { distance = 1000000; }
861 
862                     if(distance <= cutoff && j < i){
863                         singleton[i] = false;
864                         singleton[j] = false;
865                     }
866                 }
867             }
868         }
869         fileHandle.close();
870 
871         count += nseqs;
872 
873         return singletonIndexSwap;
874     }
875     catch(exception& e) {
876         m->errorOut(e, "OptiRefMatrix", "readPhylipSingletons");
877         exit(1);
878     }
879 }
880 /***********************************************************************/
readPhylip(string distFile,bool hasName,map<string,string> & names,map<string,long long> & nameAssignment,map<long long,long long> & singletonIndexSwap)881 int OptiRefMatrix::readPhylip(string distFile, bool hasName, map<string, string>& names, map<string, long long>& nameAssignment, map<long long, long long>& singletonIndexSwap){
882     try {
883         long long nseqs;
884         string name;
885         double distance;
886 
887         ifstream in; string numTest;
888         util.openInputFile(distFile, in);
889 
890         in >> numTest >> name;
891 
892         if (hasName) { name = names[name]; } //redundant names
893         nameMap[singletonIndexSwap[0]] = name;
894 
895 
896         if (!util.isContainingOnlyDigits(numTest)) { m->mothurOut("[ERROR]: expected a number and got " + numTest + ", quitting.\n"); m->setControl_pressed(true); return 0; }
897         else { convert(numTest, nseqs); }
898 
899         //square test
900         char d;
901         while((d=in.get()) != EOF){
902             if(isalnum(d)){ square = true; in.putback(d); for(int i=0;i<nseqs;i++){ in >> distance;  } break; }
903             if(d == '\n'){ square = false; break; }
904         }
905 
906         string line = "";
907         if(!square){
908 
909             for(long long i=1;i<nseqs;i++){
910 
911                 if (m->getControl_pressed()) {  break; }
912 
913                 in >> name; util.gobble(in);
914 
915                 if (hasName) { name = names[name]; } //redundant names
916                 nameMap[singletonIndexSwap[i]] = name;
917 
918                 for(long long j=0;j<i;j++){
919 
920                     in >> distance; util.gobble(in);
921 
922                     if (util.isEqual(distance,-1)) { distance = 1000000; }
923 
924                     if(distance <= cutoff){
925                         if (refWeightMethod == "connectivity") { //count dists
926                             weights[i]++; weights[j]++;
927                         }
928                         long long newB = singletonIndexSwap[j];
929                         long long newA = singletonIndexSwap[i];
930                         closeness[newA].insert(newB);
931                         closeness[newB].insert(newA);
932                     }
933                 }
934             }
935         }else{
936             for(long long i=0;i<nseqs;i++){ in >> distance;  } util.gobble(in);
937 
938             for(long long i=1;i<nseqs;i++){
939                 if (m->getControl_pressed()) {  break; }
940 
941                 in >> name; util.gobble(in);
942 
943                 if (hasName) { name = names[name]; } //redundant names
944                 nameMap[singletonIndexSwap[i]] = name;
945 
946                 for(long long j=0;j<nseqs;j++){
947                     in >> distance; util.gobble(in);
948 
949                     if (util.isEqual(distance,-1)) { distance = 1000000; }
950 
951                     if(distance <= cutoff && j < i){
952                         if (refWeightMethod == "connectivity") { //count dists
953                             weights[i]++; weights[j]++;
954                         }
955                         long long newB = singletonIndexSwap[j];
956                         long long newA = singletonIndexSwap[i];
957                         closeness[newA].insert(newB);
958                         closeness[newB].insert(newA);
959                     }
960                 }
961             }
962         }
963         in.close();
964 
965         return 0;
966     }
967     catch(exception& e) {
968         m->errorOut(e, "OptiRefMatrix", "readPhylip");
969         exit(1);
970     }
971 }
972 /***********************************************************************/
973 
readColumn(string distFile,bool hasName,map<string,string> & names,map<string,long long> & nameAssignment,map<long long,long long> & singletonIndexSwap)974 int OptiRefMatrix::readColumn(string distFile, bool hasName, map<string, string>& names, map<string, long long>& nameAssignment, map<long long, long long>& singletonIndexSwap){
975     try {
976         string firstName, secondName;
977         double distance;
978 
979         ifstream in; util.openInputFile(distFile, in);
980 
981         while(in){  //let's assume it's a triangular matrix...
982 
983             in >> firstName; util.gobble(in);
984             in >> secondName; util.gobble(in);
985             in >> distance;	util.gobble(in); // get the row and column names and distance
986 
987             if (m->getDebug()) { cout << firstName << '\t' << secondName << '\t' << distance << endl; }
988 
989             if (m->getControl_pressed()) {  in.close();   return 0; }
990 
991             if (util.isEqual(distance,-1)) { distance = 1000000; }
992 
993             if(distance <= cutoff){
994                 map<string,long long>::iterator itA = nameAssignment.find(firstName);
995                 map<string,long long>::iterator itB = nameAssignment.find(secondName);
996 
997                 if(itA == nameAssignment.end()){  m->mothurOut("AAError: Sequence '" + firstName + "' was not found in the name or count file, please correct\n"); exit(1);  }
998                 if(itB == nameAssignment.end()){  m->mothurOut("ABError: Sequence '" + secondName + "' was not found in the name or count file, please correct\n"); exit(1);  }
999 
1000                 long long indexA = (itA->second);
1001                 long long indexB = (itB->second);
1002 
1003                 if (refWeightMethod == "connectivity") { //count dists
1004                     weights[indexA]++; weights[indexB]++;
1005                 }
1006 
1007                 long long newB = singletonIndexSwap[indexB];
1008                 long long newA = singletonIndexSwap[indexA];
1009                 closeness[newA].insert(newB);
1010                 closeness[newB].insert(newA);
1011 
1012                 if (hasName) {
1013                     map<string, string>::iterator itName1 = names.find(firstName);
1014                     map<string, string>::iterator itName2 = names.find(secondName);
1015 
1016                     if (itName1 != names.end()) { firstName = itName1->second;  } //redundant names
1017                     if (itName2 != names.end()) { secondName = itName2->second;  } //redundant names
1018                 }
1019 
1020                 nameMap[newA] = firstName;
1021                 nameMap[newB] = secondName;
1022             }
1023         }
1024         in.close();
1025 
1026         return 1;
1027 
1028     }
1029     catch(exception& e) {
1030         m->errorOut(e, "OptiRefMatrix", "readColumn");
1031         exit(1);
1032     }
1033 }
1034 /***********************************************************************/
1035 
calcCounts()1036 void OptiRefMatrix::calcCounts(){
1037     try {
1038         //find number of fitDists, refDists and between dists
1039         numRefDists = 0;
1040         numFitDists = 0;
1041         numBetweenDists = 0;
1042         numFitSingletons = 0;
1043         numFitSeqs = 0;
1044         numRefSingletons = 0;
1045 
1046         for (long long i = 0; i < closeness.size(); i++) {
1047             if (m->getControl_pressed()) { break; }
1048 
1049             bool thisSeqIsRef = isRef[i];
1050             long long thisSeqsNumRefDists = 0;
1051             long long thisSeqsNumFitDists = 0;
1052 
1053             for (set<long long>::iterator it = closeness[i].begin(); it != closeness[i].end(); it++) {
1054                 long long newB = *it;
1055 
1056                 if ((thisSeqIsRef) && (isRef[newB])) {  thisSeqsNumRefDists++; } //both refs
1057                 else if ((thisSeqIsRef) && (!isRef[newB])) { numBetweenDists++; } // ref to fit dist
1058                 else if ((!thisSeqIsRef) && (isRef[newB])) { numBetweenDists++; } // fit to ref dist
1059                 else if ((!thisSeqIsRef) && (!isRef[newB])) { thisSeqsNumFitDists++; } // both fit
1060             }
1061 
1062             //a refSingleton or Fitsingleton may not be a true singleton (no valid dists in matrix), but may be a refSeq with no distances to other refs but distances to fitseqs. a fitsingleton may have dists to refs but no dists to other fitseqs.
1063 
1064             //you are a ref with no refdists, so you are a refsingleton
1065             if ((thisSeqIsRef) && (thisSeqsNumRefDists == 0)) {  numRefSingletons++; }
1066             else if ((!thisSeqIsRef) && (thisSeqsNumFitDists == 0)) {  numFitSingletons++; }
1067             else if ((!thisSeqIsRef) && (thisSeqsNumFitDists != 0)) {  numFitSeqs++; }
1068 
1069             numRefDists += thisSeqsNumRefDists;
1070             numFitDists += thisSeqsNumFitDists;
1071         }
1072 
1073         //counted twice
1074         numRefDists /= 2;
1075         numFitDists /= 2;
1076         numBetweenDists /= 2;
1077     }
1078     catch(exception& e) {
1079         m->errorOut(e, "OptiRefMatrix", "calcCounts");
1080         exit(1);
1081     }
1082 }
1083 /***********************************************************************/
1084 
1085 
1086