1 //
2 //  opticluster.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 4/20/16.
6 //  Copyright (c) 2016 Schloss Lab. All rights reserved.
7 //
8 
9 #include "opticluster.h"
10 
11 /***********************************************************************/
12 //randomly assign sequences to OTUs
initialize(double & value,bool randomize,string initialize)13 int OptiCluster::initialize(double& value, bool randomize, string initialize) {
14     try {
15         numSeqs = matrix->getNumSeqs();
16         truePositives = 0;
17         falsePositives = 0;
18         falseNegatives = 0;
19         trueNegatives = 0;
20 
21         bins.resize(numSeqs); //place seqs in own bin
22 
23         vector<long long> temp;
24         bins.push_back(temp);
25         seqBin[numSeqs] = -1;
26         insertLocation = numSeqs;
27         Utils util;
28 
29         if (initialize == "singleton") {
30 
31             //put everyone in own bin
32             for (int i = 0; i < numSeqs; i++) { bins[i].push_back(i); }
33 
34             //maps randomized sequences to bins
35             for (int i = 0; i < numSeqs; i++) {
36                 seqBin[i] = bins[i][0];
37                 randomizeSeqs.push_back(i);
38             }
39 
40             if (randomize) { util.mothurRandomShuffle(randomizeSeqs); }
41 
42             //for each sequence (singletons removed on read)
43             for (map<long long, long long>::iterator it = seqBin.begin(); it != seqBin.end(); it++) {
44                 if (it->second == -1) { }
45                 else {
46                     long long numCloseSeqs = (matrix->getNumClose(it->first)); //does not include self
47                     falseNegatives += numCloseSeqs;
48                 }
49             }
50             falseNegatives /= 2; //square matrix
51             trueNegatives = numSeqs * (numSeqs-1)/2 - (falsePositives + falseNegatives + truePositives); //since everyone is a singleton no one clusters together. True negative = num far apart
52         }else {
53 
54             //put everyone in first bin
55             for (int i = 0; i < numSeqs; i++) {
56                 bins[0].push_back(i);
57                 seqBin[i] = 0;
58                 randomizeSeqs.push_back(i);
59             }
60 
61             if (randomize) { util.mothurRandomShuffle(randomizeSeqs); }
62 
63             //for each sequence (singletons removed on read)
64             for (map<long long, long long>::iterator it = seqBin.begin(); it != seqBin.end(); it++) {
65                 if (it->second == -1) { }
66                 else {
67                     long long numCloseSeqs = (matrix->getNumClose(it->first)); //does not include self
68                     truePositives += numCloseSeqs;
69                 }
70             }
71             truePositives /= 2; //square matrix
72             falsePositives = numSeqs * (numSeqs-1)/2 - (trueNegatives + falseNegatives + truePositives);
73         }
74 
75         value = metric->getValue(truePositives, trueNegatives, falsePositives, falseNegatives);
76 
77         return value;
78     }
79     catch(exception& e) {
80         m->errorOut(e, "OptiCluster", "initialize");
81         exit(1);
82     }
83 }
84 /***********************************************************************/
85 /* for each sequence with mutual information (close)
86  * remove from current OTU and calculate MCC when sequence forms its own OTU or joins one of the other OTUs where there is a sequence within the `threshold` (no need to calculate MCC if the paired sequence is already in same OTU and no need to try every OTU - just those where there's a close sequence)
87  * keep or move the sequence to the OTU where the `metric` is the largest - flip a coin on ties */
update(double & listMetric)88 bool OptiCluster::update(double& listMetric) {
89     try {
90 
91         //for each sequence (singletons removed on read)
92         for (int i = 0; i < randomizeSeqs.size(); i++) {
93 
94             if (m->getControl_pressed()) { break; }
95 
96             map<long long, long long>::iterator it = seqBin.find(randomizeSeqs[i]);
97 
98             long long seqNumber = it->first;
99             long long binNumber = it->second;
100 
101             if (binNumber == -1) { }
102             else {
103 
104                 double tn, tp, fp, fn;
105                 double bestMetric = -1;
106                 double bestBin, bestTp, bestTn, bestFn, bestFp;
107                 tn = trueNegatives; tp = truePositives; fp = falsePositives; fn = falseNegatives;
108 
109                 //close / far count in current bin
110                 vector<double> results = getCloseFarCounts(seqNumber, binNumber);
111                 double cCount = results[0];  double fCount = results[1];
112 
113                 //metric in current bin
114                 bestMetric = metric->getValue(tp, tn, fp, fn); bestBin = binNumber; bestTp = tp; bestTn = tn; bestFp = fp; bestFn = fn;
115 
116                 //if not already singleton, then calc value if singleton was created
117                 if (!((bins[binNumber].size()) == 1)) {
118                     //make a singleton
119                     //move out of old bin
120                     fn+=cCount; tn+=fCount; fp-=fCount; tp-=cCount;
121                     double singleMetric = metric->getValue(tp, tn, fp, fn);
122                     if (singleMetric > bestMetric) {
123                         bestBin = -1; bestTp = tp; bestTn = tn; bestFp = fp; bestFn = fn;
124                         bestMetric = singleMetric;
125                     }
126                 }
127 
128                 set<long long> binsToTry;
129                 set<long long> closeSeqs = matrix->getCloseSeqs(seqNumber);
130                 for (set<long long>::iterator itClose = closeSeqs.begin(); itClose != closeSeqs.end(); itClose++) {  binsToTry.insert(seqBin[*itClose]); }
131 
132                 //merge into each "close" otu
133                 for (set<long long>::iterator it = binsToTry.begin(); it != binsToTry.end(); it++) {
134                     tn = trueNegatives; tp = truePositives; fp = falsePositives; fn = falseNegatives;
135                     fn+=cCount; tn+=fCount; fp-=fCount; tp-=cCount; //move out of old bin
136                     results = getCloseFarCounts(seqNumber, *it);
137                     fn-=results[0]; tn-=results[1];  tp+=results[0]; fp+=results[1]; //move into new bin
138                     double newMetric = metric->getValue(tp, tn, fp, fn); //score when sequence is moved
139                     //new best
140                     if (newMetric > bestMetric) { bestMetric = newMetric; bestBin = (*it); bestTp = tp; bestTn = tn; bestFp = fp; bestFn = fn; }
141                 }
142 
143                 bool usedInsert = false;
144                 if (bestBin == -1) {  bestBin = insertLocation;  usedInsert = true;  }
145 
146                 if (bestBin != binNumber) {
147                     truePositives = bestTp; trueNegatives = bestTn; falsePositives = bestFp; falseNegatives = bestFn;
148 
149                     //move seq from i to j
150                     bins[bestBin].push_back(seqNumber); //add seq to bestbin
151                     bins[binNumber].erase(remove(bins[binNumber].begin(), bins[binNumber].end(), seqNumber), bins[binNumber].end()); //remove from old bin i
152                 }
153 
154                 if (usedInsert) { insertLocation = findInsert(); }
155 
156                 //update seqBins
157                 seqBin[seqNumber] = bestBin; //set new OTU location
158             }
159         }
160 
161         listMetric = metric->getValue(truePositives, trueNegatives, falsePositives, falseNegatives);
162 
163         if (m->getDebug()) { ListVector* list = getList(); list->print(cout); delete list; }
164 
165         return 0;
166 
167     }
168     catch(exception& e) {
169         m->errorOut(e, "OptiCluster", "update");
170         exit(1);
171     }
172 }
173 /***********************************************************************/
getCloseFarCounts(long long seq,long long newBin)174 vector<double> OptiCluster::getCloseFarCounts(long long seq, long long newBin) {
175     try {
176         vector<double> results; results.push_back(0); results.push_back(0);
177 
178         if (newBin == -1) { }  //making a singleton bin. Close but we are forcing apart.
179         else { //merging a bin
180             for (int i = 0; i < bins[newBin].size(); i++) {
181                 if (seq == bins[newBin][i]) {} //ignore self
182                 else if (!matrix->isClose(seq, bins[newBin][i])) { results[1]++; }  //this sequence is "far away" from sequence i - above the cutoff
183                 else { results[0]++;  }  //this sequence is "close" to sequence i - distance between them is less than cutoff
184             }
185         }
186 
187         return results;
188     }
189     catch(exception& e) {
190         m->errorOut(e, "OptiCluster", "getCloseFarCounts");
191         exit(1);
192     }
193 }
194 
195 /***********************************************************************/
getStats(double & tp,double & tn,double & fp,double & fn)196 vector<double> OptiCluster::getStats( double& tp,  double& tn,  double& fp,  double& fn) {
197     try {
198         double singletn = matrix->getNumSingletons() + numSingletons;
199         double tempnumSeqs = numSeqs + singletn;
200 
201         tp = truePositives;
202         fp = falsePositives;
203         fn = falseNegatives;
204         tn = tempnumSeqs * (tempnumSeqs-1)/2 - (falsePositives + falseNegatives + truePositives); //adds singletons to tn
205 
206         vector<double> results;
207 
208         Sensitivity sens;   double sensitivity = sens.getValue(tp, tn, fp, fn); results.push_back(sensitivity);
209         Specificity spec;   double specificity = spec.getValue(tp, tn, fp, fn); results.push_back(specificity);
210         PPV ppv;            double positivePredictiveValue = ppv.getValue(tp, tn, fp, fn); results.push_back(positivePredictiveValue);
211         NPV npv;            double negativePredictiveValue = npv.getValue(tp, tn, fp, fn); results.push_back(negativePredictiveValue);
212         FDR fdr;            double falseDiscoveryRate = fdr.getValue(tp, tn, fp, fn); results.push_back(falseDiscoveryRate);
213         Accuracy acc;       double accuracy = acc.getValue(tp, tn, fp, fn); results.push_back(accuracy);
214         MCC mcc;            double matthewsCorrCoef = mcc.getValue(tp, tn, fp, fn); results.push_back(matthewsCorrCoef);
215         F1Score f1;         double f1Score = f1.getValue(tp, tn, fp, fn); results.push_back(f1Score);
216 
217         return results;
218     }
219     catch(exception& e) {
220         m->errorOut(e, "OptiCluster", "getStats");
221         exit(1);
222     }
223 }
224 /***********************************************************************/
getList()225 ListVector* OptiCluster::getList() {
226     try {
227         ListVector* list = new ListVector();
228         ListVector* singleton = matrix->getListSingle();
229 
230         if (singleton != NULL) { //add in any sequences above cutoff in read. Removing these saves clustering time.
231             for (int i = 0; i < singleton->getNumBins(); i++) {
232                 if (singleton->get(i) != "") {
233                     list->push_back(singleton->get(i));
234                 }
235             }
236             delete singleton;
237         }
238 
239         for (int i = 0; i < bins.size(); i++) {
240             if (bins[i].size() != 0) {
241                 string otu = matrix->getName(bins[i][0]);
242 
243                 for (int j = 1; j < bins[i].size(); j++) {
244                     otu += "," + matrix->getName(bins[i][j]);
245                 }
246                 list->push_back(otu);
247             }
248         }
249 
250         return list;
251     }
252     catch(exception& e) {
253         m->errorOut(e, "OptiCluster", "getList");
254         exit(1);
255     }
256 }
257 /***********************************************************************/
getNumBins()258 long long OptiCluster::getNumBins() {
259     try {
260         long long singletn = matrix->getNumSingletons();
261 
262         for (int i = 0; i < bins.size(); i++) {
263             if (bins[i].size() != 0) {
264                 singletn++;
265             }
266         }
267 
268         return singletn;
269     }
270     catch(exception& e) {
271         m->errorOut(e, "OptiCluster", "getNumBins");
272         exit(1);
273     }
274 }
275 
276 /***********************************************************************/
findInsert()277 long long OptiCluster::findInsert() {
278     try {
279 
280         //initially there are bins for each sequence (excluding singletons removed on read)
281         for (long long i = 0; i < bins.size(); i++) {
282 
283             if (m->getControl_pressed()) { break; }
284 
285             if (bins[i].size() == 0) { return i;  } //this bin is empty
286         }
287 
288         return -1;
289     }
290     catch(exception& e) {
291         m->errorOut(e, "OptiCluster", "findInsert");
292         exit(1);
293     }
294 }
295 
296 /***********************************************************************/
297