1 //
2 // calculator.cpp
3 // Mothur
4 //
5 // Created by Sarah Westcott on 4/21/20.
6 // Copyright © 2020 Schloss Lab. All rights reserved.
7 //
8
9 #include "calculator.h"
10
11 /***********************************************************************/
setStart(string seqA,string seqB)12 int DistCalc::setStart(string seqA, string seqB) {
13 try {
14 int start = 0;
15 int alignLength = seqA.length();
16
17 for(int i=0;i<alignLength;i++){
18 if((seqA[i] != '.' || seqB[i] != '.')){ //one of you is not a terminal gap
19 start = i;
20 break;
21 }
22 }
23
24 return start;
25 }
26 catch(exception& e) {
27 m->errorOut(e, "DistCalc", "setStart");
28 exit(1);
29 }
30 }
31 /***********************************************************************/
setEnd(string seqA,string seqB)32 int DistCalc::setEnd(string seqA, string seqB) {
33 try {
34 int end = 0;
35 int alignLength = seqA.length();
36
37 for(int i=alignLength-1;i>=0;i--){
38 if((seqA[i] != '.' || seqB[i] != '.')){ //one of you is not a terminal gap
39 end = i;
40 break;
41 }
42 }
43 return end;
44 }
45 catch(exception& e) {
46 m->errorOut(e, "DistCalc", "setEnd");
47 exit(1);
48 }
49 }
50 /***********************************************************************/
51 // this assumes that sequences start and end with '.'s instead of'-'s.
setStartIgnoreTermGap(string seqA,string seqB,bool & overlap)52 int DistCalc::setStartIgnoreTermGap(string seqA, string seqB, bool& overlap) {
53 try {
54
55 int start = 0;
56 int alignLength = seqA.length();
57
58 for(int i=0;i<alignLength;i++){
59 if(seqA[i] != '.' && seqB[i] != '.' && seqA[i] != '-' && seqB[i] != '-' ){ //skip leading gaps
60 start = i;
61
62 overlap = true;
63 break;
64 }
65 }
66
67 return start;
68 }
69 catch(exception& e) {
70 m->errorOut(e, "DistCalc", "setStartIgnoreTermGap");
71 exit(1);
72 }
73 }
74 /***********************************************************************/
75 // this assumes that sequences start and end with '.'s instead of'-'s.
setEndIgnoreTermGap(string seqA,string seqB,bool & overlap)76 int DistCalc::setEndIgnoreTermGap(string seqA, string seqB, bool& overlap) {
77 try {
78
79 int end = 0;
80 int alignLength = seqA.length();
81
82 for(int i=alignLength-1;i>=0;i--){
83 if(seqA[i] != '.' && seqB[i] != '.' && seqA[i] != '-' && seqB[i] != '-' ){ //ignore terminal gaps
84 end = i;
85
86 overlap = true;
87 break;
88 }
89 }
90
91 return end;
92 }
93 catch(exception& e) {
94 m->errorOut(e, "DistCalc", "setEndIgnoreTermGap");
95 exit(1);
96 }
97 }
98 /***********************************************************************/
99
setStartsIgnoreTermGap(classifierOTU seqA,classifierOTU otu,vector<int> cols)100 vector<int> DistCalc::setStartsIgnoreTermGap(classifierOTU seqA, classifierOTU otu, vector<int> cols){
101 try {
102 vector<int> starts; starts.resize(otu.numSeqs, -1);
103
104 int alignLength = cols.size();
105
106 int seqAStart = 0;
107 for(int i=0;i<alignLength;i++){ //for each column we want to include
108 if ((seqA.otuData[cols[i]][0] != '.') && (seqA.otuData[cols[i]][0] != '-')){
109 seqAStart = i; break;
110 }
111 }
112
113 //set start positions
114 int numset = 0;
115 for(int i=seqAStart;i<alignLength;i++){ //start can't be before seqAStart because of the &&
116
117 if(numset == otu.numSeqs) { break; }
118
119 vector<char> thisColumn = otu.otuData[cols[i]];
120 if (thisColumn.size() != otu.numSeqs) { //all seqs at this spot are identical
121
122 char thisChar = thisColumn[0];
123
124 if ((thisChar == '.') || (thisChar == '-')) { } //every seq in otu is a '.' or '-' at this location, move to next column
125 else { //this is a base in all locations, you are done
126 for (int k = 0; k < starts.size(); k++) {
127 if ((starts[k] == -1) && (seqA.otuData[cols[i]][0] != '.') && (seqA.otuData[cols[i]][0] != '-')) { starts[k] = i; numset++; } //any unset starts are set to this location
128 }
129 break;
130 }
131 }else{
132 for(int j=0;j<otu.numSeqs;j++){ //for each reference
133 if((thisColumn[j] != '.') && (thisColumn[j] != '-') && (starts[j] == -1) && (seqA.otuData[cols[i]][0] != '.') && (seqA.otuData[cols[i]][0] != '-')){ //seq j hasn't set the start value and its a base
134 starts[j] = i; numset++;
135 }
136 }
137 }
138 }
139
140 return starts;
141 }
142 catch(exception& e) {
143 m->errorOut(e, "DistCalc", "setStartsIgnoreTermGap");
144 exit(1);
145 }
146 }
147 /***********************************************************************/
148
setEndsIgnoreTermGap(classifierOTU seqA,classifierOTU otu,vector<int> cols)149 vector<int> DistCalc::setEndsIgnoreTermGap(classifierOTU seqA, classifierOTU otu, vector<int> cols){
150 try {
151 vector<int> ends; ends.resize(otu.numSeqs, -1);
152
153 int alignLength = cols.size();
154
155 int seqAEnd = 0;
156 for(int i=alignLength-1;i>=0;i--){//for each column we want to include
157 if ((seqA.otuData[cols[i]][0] != '.') && (seqA.otuData[cols[i]][0] != '-')) {
158 seqAEnd = i; break;
159 }
160 }
161
162 //set start positions
163 int numset = 0;
164 for(int i=seqAEnd;i>=0;i--){ //for each column we want to include
165
166 if(numset == otu.numSeqs) { break; }
167
168 vector<char> thisColumn = otu.otuData[cols[i]];
169 if (thisColumn.size() != otu.numSeqs) { //all seqs at this spot are identical
170
171 char thisChar = thisColumn[0];
172
173 if ((thisChar == '.') || (thisChar == '-')){ } //every seq in otu is a '.' at this location, move to next column
174 else { //this is a base in all locations, you are done
175 for (int k = 0; k < ends.size(); k++) {
176 if ((ends[k] == -1) && (seqA.otuData[cols[i]][0] != '.') && (seqA.otuData[cols[i]][0] != '-')){ ends[k] = i; numset++; } //any unset starts are set to this location
177 }
178 break;
179 }
180 }else{
181 for(int j=0;j<otu.numSeqs;j++){ //for each reference
182 if((thisColumn[j] != '.') && (thisColumn[j] != '-') && (ends[j] == -1) && (seqA.otuData[cols[i]][0] != '.') && (seqA.otuData[cols[i]][0] != '-')) { //seq j hasn't set the start value and its a base
183 ends[j] = i; numset++;
184 }
185 }
186 }
187 }
188
189 return ends;
190 }
191 catch(exception& e) {
192 m->errorOut(e, "DistCalc", "setEndsIgnoreTermGap");
193 exit(1);
194 }
195 }
196 /***********************************************************************/
197
setStarts(classifierOTU seqA,classifierOTU otu,vector<int> cols)198 vector<int> DistCalc::setStarts(classifierOTU seqA, classifierOTU otu, vector<int> cols){
199 try {
200 vector<int> starts; starts.resize(otu.numSeqs, 0);
201
202 int alignLength = cols.size();
203
204 int seqAStart = 0;
205 for(int i=0;i<alignLength;i++){ //for each column we want to include
206 if (seqA.otuData[cols[i]][0] != '.') {
207 seqAStart = i; break;
208 }
209 }
210
211 //set start positions
212 int numset = 0;
213 for(int i=0;i<alignLength;i++){ //for each column we want to include
214
215 if (seqAStart <= i) { //our query seq starts before this point so set rest of unset starts to query start
216 for (int k = 0; k < starts.size(); k++) {
217 if (starts[k] == 0) { starts[k] = seqAStart; numset++; }
218 }
219 break;
220 }else if(numset == otu.numSeqs) { break; }
221
222 vector<char> thisColumn = otu.otuData[cols[i]];
223 if (thisColumn.size() != otu.numSeqs) { //all seqs at this spot are identical
224
225 char thisChar = thisColumn[0];
226
227 if (thisChar == '.') { } //every seq in otu is a '.' at this location, move to next column
228 else { //this is a base in all locations, you are done
229 for (int k = 0; k < starts.size(); k++) {
230 if (starts[k] == 0) { starts[k] = i; numset++; } //any unset starts are set to this location
231 }
232 break;
233 }
234 }else{
235 for(int j=0;j<otu.numSeqs;j++){ //for each reference
236 if((thisColumn[j] != '.') && (starts[j] == 0)){ //seq j hasn't set the start value and its a base
237 starts[j] = i; numset++;
238 }
239 }
240 }
241 }
242
243 return starts;
244 }
245 catch(exception& e) {
246 m->errorOut(e, "DistCalc", "setStarts");
247 exit(1);
248 }
249 }
250 /***********************************************************************/
251
setEnds(classifierOTU seqA,classifierOTU otu,vector<int> cols)252 vector<int> DistCalc::setEnds(classifierOTU seqA, classifierOTU otu, vector<int> cols){
253 try {
254 vector<int> ends; ends.resize(otu.numSeqs, 0);
255
256 int alignLength = cols.size();
257
258 int seqAEnd = 0;
259 for(int i=alignLength-1;i>=0;i--){//for each column we want to include
260 if (seqA.otuData[cols[i]][0] != '.') {
261 seqAEnd = i; break;
262 }
263 }
264
265 //set start positions
266 int numset = 0;
267 for(int i=alignLength-1;i>=0;i--){ //for each column we want to include
268
269 if (seqAEnd <= i) { //our query seq starts before this point so set rest of unset starts to query start
270 for (int k = 0; k < ends.size(); k++) {
271 if (ends[k] == 0) { ends[k] = seqAEnd; numset++; }
272 }
273 break;
274 }else if(numset == otu.numSeqs) { break; }
275
276 vector<char> thisColumn = otu.otuData[cols[i]];
277 if (thisColumn.size() != otu.numSeqs) { //all seqs at this spot are identical
278
279 char thisChar = thisColumn[0];
280
281 if (thisChar == '.') { } //every seq in otu is a '.' at this location, move to next column
282 else { //this is a base in all locations, you are done
283 for (int k = 0; k < ends.size(); k++) {
284 if (ends[k] == 0) { ends[k] = i; numset++; } //any unset starts are set to this location
285 }
286 break;
287 }
288 }else{
289 for(int j=0;j<otu.numSeqs;j++){ //for each reference
290 if((thisColumn[j] != '.') && (ends[j] == 0)){ //seq j hasn't set the start value and its a base
291 ends[j] = i; numset++;
292 }
293 }
294 }
295 }
296
297 return ends;
298 }
299 catch(exception& e) {
300 m->errorOut(e, "DistCalc", "setEnds");
301 exit(1);
302 }
303 }
304
305 /***********************************************************************/
306 //nb1 and nb2 have size 1, unless amino acid = B or Z
predict(vector<int> nb1,vector<int> nb2,double & p,double & dp,double & d2p,double & tt,double eigs[20],double probs[20][20])307 void DistCalc::predict(vector<int> nb1, vector<int> nb2, double& p, double& dp, double& d2p, double& tt, double eigs[20], double probs[20][20]){
308 try {
309 double q;
310
311 for (int i = 0; i < nb1.size(); i++) {
312
313 for (int l = 0; l < 20; l++) {
314
315 double elambdat = exp(tt * eigs[l]);
316
317 // printf("l = %ld, nb1 = %ld, nb2 = %ld\n", l, nb1[i], nb2[i]);
318 // printf("l = %ld, eig[m] = %f, prob[m][nb1 - 1] = %f, prob[m][nb2 - 1] = %f\n", l, eigs[l], probs[l][nb1[i] - 1], probs[l][nb2[i] - 1]);
319
320
321 q = probs[l][nb1[i]-1] * probs[l][nb2[i]-1] * elambdat;
322 p += q;
323
324 dp += eigs[l] * q;
325
326 double TEMP = eigs[l];
327
328 d2p += TEMP * TEMP * q;
329 }
330 }
331
332 //printf("p = %f, q = %f, tt = %f\n", p, q, tt);
333 // printf("dp = %f, d2p = %f\n", dp, d2p);
334 }
335 catch(exception& e) {
336 m->errorOut(e, "DistCalc", "predict");
337 exit(1);
338 }
339 }
340 /***********************************************************************/
341 //nb1 and nb2 have size 1, unless amino acid = B or Z
fillNums(vector<int> & numAs,vector<int> & numBs,int numA,int numB)342 void DistCalc::fillNums(vector<int>& numAs, vector<int>& numBs, int numA, int numB){
343 try {
344
345 if (numA == asx) { //B asn or asp (3 or 4)
346 if (numB == asx) { //B asn or asp
347 numAs.push_back(3); numBs.push_back(3); //asn, asn
348 numAs.push_back(3); numBs.push_back(4); //asn, asp
349 numAs.push_back(4); numBs.push_back(3); //asp, asn
350 numAs.push_back(4); numBs.push_back(4); //asp, asp
351 }else {
352 if (numB == glx) { //Z gln or glu (6 or 7)
353 numAs.push_back(3); numBs.push_back(6); //asn, gln
354 numAs.push_back(3); numBs.push_back(7); //asn, glu
355 numAs.push_back(4); numBs.push_back(6); //asp, gln
356 numAs.push_back(4); numBs.push_back(7); //asp, glu
357 }else {
358 if (numB < ser2) { numB++; }
359 numAs.push_back(3); numBs.push_back(numB); //asn, numB
360 numAs.push_back(4); numBs.push_back(numB); //asp, numB
361 }
362 }
363 }else {
364 if (numA == glx) { //Z gln or glu (6 or 7)
365 if (numB == asx) { //B asn or asp
366 numAs.push_back(6); numBs.push_back(3); //gln, asn
367 numAs.push_back(6); numBs.push_back(4); //gln, asp
368 numAs.push_back(7); numBs.push_back(3); //glu, asn
369 numAs.push_back(7); numBs.push_back(4); //glu, asp
370 }else {
371 if (numB == glx) { //Z gln or glu (6 or 7)
372 numAs.push_back(6); numBs.push_back(6); //gln, gln
373 numAs.push_back(6); numBs.push_back(7); //gln, glu
374 numAs.push_back(7); numBs.push_back(6); //glu, gln
375 numAs.push_back(7); numBs.push_back(7); //glu, glu
376 }else {
377 if (numB < ser2) { numB++; }
378 numAs.push_back(6); numBs.push_back(numB); //gln, numB
379 numAs.push_back(7); numBs.push_back(numB); //glu, numB
380 }
381 }
382 }else {
383 if (numA < ser2) { numA++; }
384 if (numB == asx) { //B asn or asp
385 numAs.push_back(numA); numBs.push_back(3); //numA, asn
386 numAs.push_back(numA); numBs.push_back(4); //numA, asp
387 numAs.push_back(numA); numBs.push_back(3); //numA, asn
388 numAs.push_back(numA); numBs.push_back(4); //numA, asp
389 }else if (numB == glx) { //Z gln or glu (6 or 7)
390 numAs.push_back(numA); numBs.push_back(6); //numA, gln
391 numAs.push_back(numA); numBs.push_back(7); //numA, glu
392 numAs.push_back(numA); numBs.push_back(6); //numA, gln
393 numAs.push_back(numA); numBs.push_back(7); //numA, glu
394 }
395 }
396 }
397 }
398 catch(exception& e) {
399 m->errorOut(e, "DistCalc", "fillNums");
400 exit(1);
401 }
402 }
403 /***********************************************************************/
makeDists(Protein A,Protein B,double eigs[20],double probs[20][20])404 double DistCalc::makeDists(Protein A, Protein B, double eigs[20], double probs[20][20]){
405 try {
406 int numBases = A.getAlignLength();
407 vector<AminoAcid> seqA = A.getAligned();
408 vector<AminoAcid> seqB = B.getAligned();
409
410 bool inf = false; bool neginfinity = false; bool overlap = false;
411 double delta, lnlike, slope, curv, tt;
412 tt = 0.1; delta = tt / 2.0;
413
414 for (int l = 0; l < 20; l++) {
415
416 //reset for this attempt
417 lnlike = 0.0; slope = 0.0; curv = 0.0; neginfinity = false; overlap = false;
418 double oldweight = 1.0;
419
420 if (m->getControl_pressed()) { break; }
421
422 for (int i = 0; i < numBases; i++) {
423 int numA = seqA[i].getNum();
424 int numB = seqB[i].getNum();
425
426 if (numA != stop && numA != del && numA != quest && numA != unk &&
427 numB != stop && numB != del && numB != quest && numB != unk) {
428
429 double p = 0.0; double dp = 0.0; double d2p = 0.0; overlap = true;
430
431 vector<int> numAs; vector<int> numBs;
432 if (numA != asx && numA != glx && numB != asx && numB != glx) {
433 if (numA < ser2) { numA++; }
434 if (numB < ser2) { numB++; }
435 numAs.push_back(numA); numBs.push_back(numB); //+1 avoid 0
436 }else {
437 fillNums(numAs, numBs, numA, numB);
438 }
439
440 predict(numAs, numBs, p, dp, d2p, tt, eigs, probs);
441
442 if (p <= 0.0) {
443 neginfinity = true;
444 }else {
445 slope += oldweight*dp / p;
446 curv += oldweight*(d2p / p - dp * dp / (p * p));
447
448 //printf("%ld:%ld, dp = %f, p = %f, d2p = %f\n", l, i, dp, p, d2p);
449
450 //printf("%ld:%ld, slope = %f, curv = %f, oldweight[i] = %ld\n", l, i, slope, curv, oldweight);
451 }
452 }//endif stop
453 }//endif bases
454
455 if (!overlap) {
456 tt = -1.0; l += 20; inf = true;
457 }else if (!neginfinity) {
458 if (curv < 0.0) {
459 tt -= slope / curv;
460 if (tt > 10000.0) { tt = -1.0; l += 20; inf = true; }
461 }else {
462 if ((slope > 0.0 && delta < 0.0) || (slope < 0.0 && delta > 0.0)) { delta /= -2; }
463 tt += delta;
464 }
465 }else {
466 delta /= -2;
467 tt += delta;
468 }
469
470 if (tt < 0.00001 && !inf) { tt = 0.00001; }
471
472 }//endif attempts
473
474 dist = tt;
475 //exit(1);
476
477 return dist;
478 }
479 catch(exception& e) {
480 m->errorOut(e, "DistCalc", "makeDists");
481 exit(1);
482 }
483 }
484 /***********************************************************************/
485