1 //
2 // opticluster.cpp
3 // Mothur
4 //
5 // Created by Sarah Westcott on 4/20/16.
6 // Copyright (c) 2016 Schloss Lab. All rights reserved.
7 //
8
9 #include "opticluster.h"
10
11 /***********************************************************************/
12 //randomly assign sequences to OTUs
initialize(double & value,bool randomize,string initialize)13 int OptiCluster::initialize(double& value, bool randomize, string initialize) {
14 try {
15 numSeqs = matrix->getNumSeqs();
16 truePositives = 0;
17 falsePositives = 0;
18 falseNegatives = 0;
19 trueNegatives = 0;
20
21 bins.resize(numSeqs); //place seqs in own bin
22
23 vector<long long> temp;
24 bins.push_back(temp);
25 seqBin[numSeqs] = -1;
26 insertLocation = numSeqs;
27 Utils util;
28
29 if (initialize == "singleton") {
30
31 //put everyone in own bin
32 for (int i = 0; i < numSeqs; i++) { bins[i].push_back(i); }
33
34 //maps randomized sequences to bins
35 for (int i = 0; i < numSeqs; i++) {
36 seqBin[i] = bins[i][0];
37 randomizeSeqs.push_back(i);
38 }
39
40 if (randomize) { util.mothurRandomShuffle(randomizeSeqs); }
41
42 //for each sequence (singletons removed on read)
43 for (map<long long, long long>::iterator it = seqBin.begin(); it != seqBin.end(); it++) {
44 if (it->second == -1) { }
45 else {
46 long long numCloseSeqs = (matrix->getNumClose(it->first)); //does not include self
47 falseNegatives += numCloseSeqs;
48 }
49 }
50 falseNegatives /= 2; //square matrix
51 trueNegatives = numSeqs * (numSeqs-1)/2 - (falsePositives + falseNegatives + truePositives); //since everyone is a singleton no one clusters together. True negative = num far apart
52 }else {
53
54 //put everyone in first bin
55 for (int i = 0; i < numSeqs; i++) {
56 bins[0].push_back(i);
57 seqBin[i] = 0;
58 randomizeSeqs.push_back(i);
59 }
60
61 if (randomize) { util.mothurRandomShuffle(randomizeSeqs); }
62
63 //for each sequence (singletons removed on read)
64 for (map<long long, long long>::iterator it = seqBin.begin(); it != seqBin.end(); it++) {
65 if (it->second == -1) { }
66 else {
67 long long numCloseSeqs = (matrix->getNumClose(it->first)); //does not include self
68 truePositives += numCloseSeqs;
69 }
70 }
71 truePositives /= 2; //square matrix
72 falsePositives = numSeqs * (numSeqs-1)/2 - (trueNegatives + falseNegatives + truePositives);
73 }
74
75 value = metric->getValue(truePositives, trueNegatives, falsePositives, falseNegatives);
76
77 return value;
78 }
79 catch(exception& e) {
80 m->errorOut(e, "OptiCluster", "initialize");
81 exit(1);
82 }
83 }
84 /***********************************************************************/
85 /* for each sequence with mutual information (close)
86 * remove from current OTU and calculate MCC when sequence forms its own OTU or joins one of the other OTUs where there is a sequence within the `threshold` (no need to calculate MCC if the paired sequence is already in same OTU and no need to try every OTU - just those where there's a close sequence)
87 * keep or move the sequence to the OTU where the `metric` is the largest - flip a coin on ties */
update(double & listMetric)88 bool OptiCluster::update(double& listMetric) {
89 try {
90
91 //for each sequence (singletons removed on read)
92 for (int i = 0; i < randomizeSeqs.size(); i++) {
93
94 if (m->getControl_pressed()) { break; }
95
96 map<long long, long long>::iterator it = seqBin.find(randomizeSeqs[i]);
97
98 long long seqNumber = it->first;
99 long long binNumber = it->second;
100
101 if (binNumber == -1) { }
102 else {
103
104 double tn, tp, fp, fn;
105 double bestMetric = -1;
106 double bestBin, bestTp, bestTn, bestFn, bestFp;
107 tn = trueNegatives; tp = truePositives; fp = falsePositives; fn = falseNegatives;
108
109 //close / far count in current bin
110 vector<double> results = getCloseFarCounts(seqNumber, binNumber);
111 double cCount = results[0]; double fCount = results[1];
112
113 //metric in current bin
114 bestMetric = metric->getValue(tp, tn, fp, fn); bestBin = binNumber; bestTp = tp; bestTn = tn; bestFp = fp; bestFn = fn;
115
116 //if not already singleton, then calc value if singleton was created
117 if (!((bins[binNumber].size()) == 1)) {
118 //make a singleton
119 //move out of old bin
120 fn+=cCount; tn+=fCount; fp-=fCount; tp-=cCount;
121 double singleMetric = metric->getValue(tp, tn, fp, fn);
122 if (singleMetric > bestMetric) {
123 bestBin = -1; bestTp = tp; bestTn = tn; bestFp = fp; bestFn = fn;
124 bestMetric = singleMetric;
125 }
126 }
127
128 set<long long> binsToTry;
129 set<long long> closeSeqs = matrix->getCloseSeqs(seqNumber);
130 for (set<long long>::iterator itClose = closeSeqs.begin(); itClose != closeSeqs.end(); itClose++) { binsToTry.insert(seqBin[*itClose]); }
131
132 //merge into each "close" otu
133 for (set<long long>::iterator it = binsToTry.begin(); it != binsToTry.end(); it++) {
134 tn = trueNegatives; tp = truePositives; fp = falsePositives; fn = falseNegatives;
135 fn+=cCount; tn+=fCount; fp-=fCount; tp-=cCount; //move out of old bin
136 results = getCloseFarCounts(seqNumber, *it);
137 fn-=results[0]; tn-=results[1]; tp+=results[0]; fp+=results[1]; //move into new bin
138 double newMetric = metric->getValue(tp, tn, fp, fn); //score when sequence is moved
139 //new best
140 if (newMetric > bestMetric) { bestMetric = newMetric; bestBin = (*it); bestTp = tp; bestTn = tn; bestFp = fp; bestFn = fn; }
141 }
142
143 bool usedInsert = false;
144 if (bestBin == -1) { bestBin = insertLocation; usedInsert = true; }
145
146 if (bestBin != binNumber) {
147 truePositives = bestTp; trueNegatives = bestTn; falsePositives = bestFp; falseNegatives = bestFn;
148
149 //move seq from i to j
150 bins[bestBin].push_back(seqNumber); //add seq to bestbin
151 bins[binNumber].erase(remove(bins[binNumber].begin(), bins[binNumber].end(), seqNumber), bins[binNumber].end()); //remove from old bin i
152 }
153
154 if (usedInsert) { insertLocation = findInsert(); }
155
156 //update seqBins
157 seqBin[seqNumber] = bestBin; //set new OTU location
158 }
159 }
160
161 listMetric = metric->getValue(truePositives, trueNegatives, falsePositives, falseNegatives);
162
163 if (m->getDebug()) { ListVector* list = getList(); list->print(cout); delete list; }
164
165 return 0;
166
167 }
168 catch(exception& e) {
169 m->errorOut(e, "OptiCluster", "update");
170 exit(1);
171 }
172 }
173 /***********************************************************************/
getCloseFarCounts(long long seq,long long newBin)174 vector<double> OptiCluster::getCloseFarCounts(long long seq, long long newBin) {
175 try {
176 vector<double> results; results.push_back(0); results.push_back(0);
177
178 if (newBin == -1) { } //making a singleton bin. Close but we are forcing apart.
179 else { //merging a bin
180 for (int i = 0; i < bins[newBin].size(); i++) {
181 if (seq == bins[newBin][i]) {} //ignore self
182 else if (!matrix->isClose(seq, bins[newBin][i])) { results[1]++; } //this sequence is "far away" from sequence i - above the cutoff
183 else { results[0]++; } //this sequence is "close" to sequence i - distance between them is less than cutoff
184 }
185 }
186
187 return results;
188 }
189 catch(exception& e) {
190 m->errorOut(e, "OptiCluster", "getCloseFarCounts");
191 exit(1);
192 }
193 }
194
195 /***********************************************************************/
getStats(double & tp,double & tn,double & fp,double & fn)196 vector<double> OptiCluster::getStats( double& tp, double& tn, double& fp, double& fn) {
197 try {
198 double singletn = matrix->getNumSingletons() + numSingletons;
199 double tempnumSeqs = numSeqs + singletn;
200
201 tp = truePositives;
202 fp = falsePositives;
203 fn = falseNegatives;
204 tn = tempnumSeqs * (tempnumSeqs-1)/2 - (falsePositives + falseNegatives + truePositives); //adds singletons to tn
205
206 vector<double> results;
207
208 Sensitivity sens; double sensitivity = sens.getValue(tp, tn, fp, fn); results.push_back(sensitivity);
209 Specificity spec; double specificity = spec.getValue(tp, tn, fp, fn); results.push_back(specificity);
210 PPV ppv; double positivePredictiveValue = ppv.getValue(tp, tn, fp, fn); results.push_back(positivePredictiveValue);
211 NPV npv; double negativePredictiveValue = npv.getValue(tp, tn, fp, fn); results.push_back(negativePredictiveValue);
212 FDR fdr; double falseDiscoveryRate = fdr.getValue(tp, tn, fp, fn); results.push_back(falseDiscoveryRate);
213 Accuracy acc; double accuracy = acc.getValue(tp, tn, fp, fn); results.push_back(accuracy);
214 MCC mcc; double matthewsCorrCoef = mcc.getValue(tp, tn, fp, fn); results.push_back(matthewsCorrCoef);
215 F1Score f1; double f1Score = f1.getValue(tp, tn, fp, fn); results.push_back(f1Score);
216
217 return results;
218 }
219 catch(exception& e) {
220 m->errorOut(e, "OptiCluster", "getStats");
221 exit(1);
222 }
223 }
224 /***********************************************************************/
getList()225 ListVector* OptiCluster::getList() {
226 try {
227 ListVector* list = new ListVector();
228 ListVector* singleton = matrix->getListSingle();
229
230 if (singleton != NULL) { //add in any sequences above cutoff in read. Removing these saves clustering time.
231 for (int i = 0; i < singleton->getNumBins(); i++) {
232 if (singleton->get(i) != "") {
233 list->push_back(singleton->get(i));
234 }
235 }
236 delete singleton;
237 }
238
239 for (int i = 0; i < bins.size(); i++) {
240 if (bins[i].size() != 0) {
241 string otu = matrix->getName(bins[i][0]);
242
243 for (int j = 1; j < bins[i].size(); j++) {
244 otu += "," + matrix->getName(bins[i][j]);
245 }
246 list->push_back(otu);
247 }
248 }
249
250 return list;
251 }
252 catch(exception& e) {
253 m->errorOut(e, "OptiCluster", "getList");
254 exit(1);
255 }
256 }
257 /***********************************************************************/
getNumBins()258 long long OptiCluster::getNumBins() {
259 try {
260 long long singletn = matrix->getNumSingletons();
261
262 for (int i = 0; i < bins.size(); i++) {
263 if (bins[i].size() != 0) {
264 singletn++;
265 }
266 }
267
268 return singletn;
269 }
270 catch(exception& e) {
271 m->errorOut(e, "OptiCluster", "getNumBins");
272 exit(1);
273 }
274 }
275
276 /***********************************************************************/
findInsert()277 long long OptiCluster::findInsert() {
278 try {
279
280 //initially there are bins for each sequence (excluding singletons removed on read)
281 for (long long i = 0; i < bins.size(); i++) {
282
283 if (m->getControl_pressed()) { break; }
284
285 if (bins[i].size() == 0) { return i; } //this bin is empty
286 }
287
288 return -1;
289 }
290 catch(exception& e) {
291 m->errorOut(e, "OptiCluster", "findInsert");
292 exit(1);
293 }
294 }
295
296 /***********************************************************************/
297