1 //
2 //  calculator.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 4/21/20.
6 //  Copyright © 2020 Schloss Lab. All rights reserved.
7 //
8 
9 #include "calculator.h"
10 
11 /***********************************************************************/
setStart(string seqA,string seqB)12 int DistCalc::setStart(string seqA, string seqB) {
13     try {
14         int start = 0;
15         int alignLength = seqA.length();
16 
17         for(int i=0;i<alignLength;i++){
18             if((seqA[i] != '.' || seqB[i] != '.')){ //one of you is not a terminal gap
19                 start = i;
20                 break;
21             }
22         }
23 
24         return start;
25     }
26     catch(exception& e) {
27         m->errorOut(e, "DistCalc", "setStart");
28         exit(1);
29     }
30 }
31 /***********************************************************************/
setEnd(string seqA,string seqB)32 int DistCalc::setEnd(string seqA, string seqB) {
33     try {
34         int end = 0;
35         int alignLength = seqA.length();
36 
37         for(int i=alignLength-1;i>=0;i--){
38             if((seqA[i] != '.' || seqB[i] != '.')){ //one of you is not a terminal gap
39                 end = i;
40                 break;
41             }
42         }
43         return end;
44     }
45     catch(exception& e) {
46         m->errorOut(e, "DistCalc", "setEnd");
47         exit(1);
48     }
49 }
50 /***********************************************************************/
51 // this assumes that sequences start and end with '.'s instead of'-'s.
setStartIgnoreTermGap(string seqA,string seqB,bool & overlap)52 int DistCalc::setStartIgnoreTermGap(string seqA, string seqB, bool& overlap) {
53     try {
54 
55         int start = 0;
56         int alignLength = seqA.length();
57 
58         for(int i=0;i<alignLength;i++){
59             if(seqA[i] != '.' && seqB[i] != '.' && seqA[i] != '-' && seqB[i] != '-' ){ //skip leading gaps
60                 start = i;
61 
62                 overlap = true;
63                 break;
64             }
65         }
66 
67         return start;
68     }
69     catch(exception& e) {
70         m->errorOut(e, "DistCalc", "setStartIgnoreTermGap");
71         exit(1);
72     }
73 }
74 /***********************************************************************/
75 // this assumes that sequences start and end with '.'s instead of'-'s.
setEndIgnoreTermGap(string seqA,string seqB,bool & overlap)76 int DistCalc::setEndIgnoreTermGap(string seqA, string seqB, bool& overlap) {
77     try {
78 
79         int end = 0;
80         int alignLength = seqA.length();
81 
82         for(int i=alignLength-1;i>=0;i--){
83             if(seqA[i] != '.' && seqB[i] != '.' && seqA[i] != '-' && seqB[i] != '-' ){ //ignore terminal gaps
84                 end = i;
85 
86                 overlap = true;
87                 break;
88             }
89         }
90 
91         return end;
92     }
93     catch(exception& e) {
94         m->errorOut(e, "DistCalc", "setEndIgnoreTermGap");
95         exit(1);
96     }
97 }
98 /***********************************************************************/
99 
setStartsIgnoreTermGap(classifierOTU seqA,classifierOTU otu,vector<int> cols)100 vector<int> DistCalc::setStartsIgnoreTermGap(classifierOTU seqA, classifierOTU otu, vector<int> cols){
101     try {
102         vector<int> starts; starts.resize(otu.numSeqs, -1);
103 
104         int alignLength = cols.size();
105 
106         int seqAStart = 0;
107         for(int i=0;i<alignLength;i++){ //for each column we want to include
108             if ((seqA.otuData[cols[i]][0] != '.') && (seqA.otuData[cols[i]][0] != '-')){
109                 seqAStart = i; break;
110             }
111         }
112 
113         //set start positions
114         int numset = 0;
115         for(int i=seqAStart;i<alignLength;i++){ //start can't be before seqAStart because of the &&
116 
117             if(numset == otu.numSeqs) { break; }
118 
119             vector<char> thisColumn = otu.otuData[cols[i]];
120             if (thisColumn.size() != otu.numSeqs) { //all seqs at this spot are identical
121 
122                 char thisChar = thisColumn[0];
123 
124                 if ((thisChar == '.') || (thisChar == '-')) { } //every seq in otu is a '.' or '-' at this location, move to next column
125                 else { //this is a base in all locations, you are done
126                     for (int k = 0; k < starts.size(); k++) {
127                         if ((starts[k] == -1) && (seqA.otuData[cols[i]][0] != '.') && (seqA.otuData[cols[i]][0] != '-')) { starts[k] = i; numset++; } //any unset starts are set to this location
128                     }
129                     break;
130                 }
131             }else{
132                 for(int j=0;j<otu.numSeqs;j++){ //for each reference
133                     if((thisColumn[j] != '.') && (thisColumn[j] != '-') && (starts[j] == -1) && (seqA.otuData[cols[i]][0] != '.') && (seqA.otuData[cols[i]][0] != '-')){ //seq j hasn't set the start value and its a base
134                         starts[j] = i; numset++;
135                     }
136                 }
137             }
138         }
139 
140         return starts;
141     }
142     catch(exception& e) {
143         m->errorOut(e, "DistCalc", "setStartsIgnoreTermGap");
144         exit(1);
145     }
146 }
147 /***********************************************************************/
148 
setEndsIgnoreTermGap(classifierOTU seqA,classifierOTU otu,vector<int> cols)149 vector<int> DistCalc::setEndsIgnoreTermGap(classifierOTU seqA, classifierOTU otu, vector<int> cols){
150     try {
151         vector<int> ends; ends.resize(otu.numSeqs, -1);
152 
153         int alignLength = cols.size();
154 
155         int seqAEnd = 0;
156         for(int i=alignLength-1;i>=0;i--){//for each column we want to include
157             if ((seqA.otuData[cols[i]][0] != '.') && (seqA.otuData[cols[i]][0] != '-')) {
158                 seqAEnd = i; break;
159             }
160         }
161 
162         //set start positions
163         int numset = 0;
164         for(int i=seqAEnd;i>=0;i--){ //for each column we want to include
165 
166             if(numset == otu.numSeqs) { break; }
167 
168             vector<char> thisColumn = otu.otuData[cols[i]];
169             if (thisColumn.size() != otu.numSeqs) { //all seqs at this spot are identical
170 
171                 char thisChar = thisColumn[0];
172 
173                 if ((thisChar == '.') || (thisChar == '-')){ } //every seq in otu is a '.' at this location, move to next column
174                 else { //this is a base in all locations, you are done
175                     for (int k = 0; k < ends.size(); k++) {
176                         if ((ends[k] == -1) && (seqA.otuData[cols[i]][0] != '.') && (seqA.otuData[cols[i]][0] != '-')){ ends[k] = i; numset++; } //any unset starts are set to this location
177                     }
178                     break;
179                 }
180             }else{
181                 for(int j=0;j<otu.numSeqs;j++){ //for each reference
182                     if((thisColumn[j] != '.') && (thisColumn[j] != '-') && (ends[j] == -1) && (seqA.otuData[cols[i]][0] != '.') && (seqA.otuData[cols[i]][0] != '-')) { //seq j hasn't set the start value and its a base
183                         ends[j] = i; numset++;
184                     }
185                 }
186             }
187         }
188 
189         return ends;
190     }
191     catch(exception& e) {
192         m->errorOut(e, "DistCalc", "setEndsIgnoreTermGap");
193         exit(1);
194     }
195 }
196 /***********************************************************************/
197 
setStarts(classifierOTU seqA,classifierOTU otu,vector<int> cols)198 vector<int> DistCalc::setStarts(classifierOTU seqA, classifierOTU otu, vector<int> cols){
199     try {
200         vector<int> starts; starts.resize(otu.numSeqs, 0);
201 
202         int alignLength = cols.size();
203 
204         int seqAStart = 0;
205         for(int i=0;i<alignLength;i++){ //for each column we want to include
206             if (seqA.otuData[cols[i]][0] != '.') {
207                 seqAStart = i; break;
208             }
209         }
210 
211         //set start positions
212         int numset = 0;
213         for(int i=0;i<alignLength;i++){ //for each column we want to include
214 
215             if (seqAStart <= i) { //our query seq starts before this point so set rest of unset starts to query start
216                 for (int k = 0; k < starts.size(); k++) {
217                     if (starts[k] == 0) { starts[k] = seqAStart; numset++; }
218                 }
219                 break;
220             }else if(numset == otu.numSeqs) { break; }
221 
222             vector<char> thisColumn = otu.otuData[cols[i]];
223             if (thisColumn.size() != otu.numSeqs) { //all seqs at this spot are identical
224 
225                 char thisChar = thisColumn[0];
226 
227                 if (thisChar == '.') { } //every seq in otu is a '.' at this location, move to next column
228                 else { //this is a base in all locations, you are done
229                     for (int k = 0; k < starts.size(); k++) {
230                         if (starts[k] == 0) { starts[k] = i; numset++; } //any unset starts are set to this location
231                     }
232                     break;
233                 }
234             }else{
235                 for(int j=0;j<otu.numSeqs;j++){ //for each reference
236                     if((thisColumn[j] != '.') && (starts[j] == 0)){ //seq j hasn't set the start value and its a base
237                         starts[j] = i; numset++;
238                     }
239                 }
240             }
241         }
242 
243         return starts;
244     }
245     catch(exception& e) {
246         m->errorOut(e, "DistCalc", "setStarts");
247         exit(1);
248     }
249 }
250 /***********************************************************************/
251 
setEnds(classifierOTU seqA,classifierOTU otu,vector<int> cols)252 vector<int> DistCalc::setEnds(classifierOTU seqA, classifierOTU otu, vector<int> cols){
253     try {
254         vector<int> ends; ends.resize(otu.numSeqs, 0);
255 
256         int alignLength = cols.size();
257 
258         int seqAEnd = 0;
259         for(int i=alignLength-1;i>=0;i--){//for each column we want to include
260             if (seqA.otuData[cols[i]][0] != '.') {
261                 seqAEnd = i; break;
262             }
263         }
264 
265         //set start positions
266         int numset = 0;
267         for(int i=alignLength-1;i>=0;i--){ //for each column we want to include
268 
269             if (seqAEnd <= i) { //our query seq starts before this point so set rest of unset starts to query start
270                 for (int k = 0; k < ends.size(); k++) {
271                     if (ends[k] == 0) { ends[k] = seqAEnd; numset++; }
272                 }
273                 break;
274             }else if(numset == otu.numSeqs) { break; }
275 
276             vector<char> thisColumn = otu.otuData[cols[i]];
277             if (thisColumn.size() != otu.numSeqs) { //all seqs at this spot are identical
278 
279                 char thisChar = thisColumn[0];
280 
281                 if (thisChar == '.') { } //every seq in otu is a '.' at this location, move to next column
282                 else { //this is a base in all locations, you are done
283                     for (int k = 0; k < ends.size(); k++) {
284                         if (ends[k] == 0) { ends[k] = i; numset++; } //any unset starts are set to this location
285                     }
286                     break;
287                 }
288             }else{
289                 for(int j=0;j<otu.numSeqs;j++){ //for each reference
290                     if((thisColumn[j] != '.') && (ends[j] == 0)){ //seq j hasn't set the start value and its a base
291                         ends[j] = i; numset++;
292                     }
293                 }
294             }
295         }
296 
297         return ends;
298     }
299     catch(exception& e) {
300         m->errorOut(e, "DistCalc", "setEnds");
301         exit(1);
302     }
303 }
304 
305 /***********************************************************************/
306 //nb1 and nb2 have size 1, unless amino acid = B or Z
predict(vector<int> nb1,vector<int> nb2,double & p,double & dp,double & d2p,double & tt,double eigs[20],double probs[20][20])307 void DistCalc::predict(vector<int> nb1, vector<int> nb2, double& p, double& dp, double& d2p, double& tt, double eigs[20], double probs[20][20]){
308     try {
309         double q;
310 
311         for (int i = 0; i < nb1.size(); i++) {
312 
313             for (int l = 0; l < 20; l++) {
314 
315                 double elambdat = exp(tt * eigs[l]);
316 
317                // printf("l = %ld, nb1 = %ld, nb2 = %ld\n", l, nb1[i], nb2[i]);
318                // printf("l = %ld, eig[m] = %f, prob[m][nb1 - 1] = %f, prob[m][nb2 - 1] = %f\n", l, eigs[l], probs[l][nb1[i] - 1], probs[l][nb2[i] - 1]);
319 
320 
321                 q = probs[l][nb1[i]-1] * probs[l][nb2[i]-1] * elambdat;
322                 p += q;
323 
324                 dp += eigs[l] * q;
325 
326                 double TEMP = eigs[l];
327 
328                 d2p += TEMP * TEMP * q;
329             }
330         }
331 
332         //printf("p = %f, q = %f, tt = %f\n", p, q, tt);
333        // printf("dp = %f, d2p = %f\n", dp, d2p);
334     }
335     catch(exception& e) {
336         m->errorOut(e,  "DistCalc", "predict");
337         exit(1);
338     }
339 }
340 /***********************************************************************/
341 //nb1 and nb2 have size 1, unless amino acid = B or Z
fillNums(vector<int> & numAs,vector<int> & numBs,int numA,int numB)342 void DistCalc::fillNums(vector<int>& numAs, vector<int>& numBs, int numA, int numB){
343     try {
344 
345         if (numA == asx) { //B asn or asp (3 or 4)
346           if (numB == asx) { //B asn or asp
347               numAs.push_back(3); numBs.push_back(3); //asn, asn
348               numAs.push_back(3); numBs.push_back(4); //asn, asp
349               numAs.push_back(4); numBs.push_back(3); //asp, asn
350               numAs.push_back(4); numBs.push_back(4); //asp, asp
351           }else {
352               if (numB == glx) { //Z gln or glu (6 or 7)
353                   numAs.push_back(3); numBs.push_back(6); //asn, gln
354                   numAs.push_back(3); numBs.push_back(7); //asn, glu
355                   numAs.push_back(4); numBs.push_back(6); //asp, gln
356                   numAs.push_back(4); numBs.push_back(7); //asp, glu
357               }else {
358                   if (numB < ser2) { numB++; }
359                   numAs.push_back(3); numBs.push_back(numB); //asn, numB
360                   numAs.push_back(4); numBs.push_back(numB); //asp, numB
361               }
362           }
363         }else {
364             if (numA == glx) { //Z gln or glu (6 or 7)
365                 if (numB == asx) { //B asn or asp
366                     numAs.push_back(6); numBs.push_back(3); //gln, asn
367                     numAs.push_back(6); numBs.push_back(4); //gln, asp
368                     numAs.push_back(7); numBs.push_back(3); //glu, asn
369                     numAs.push_back(7); numBs.push_back(4); //glu, asp
370                 }else {
371                     if (numB == glx) { //Z gln or glu (6 or 7)
372                         numAs.push_back(6); numBs.push_back(6); //gln, gln
373                         numAs.push_back(6); numBs.push_back(7); //gln, glu
374                         numAs.push_back(7); numBs.push_back(6); //glu, gln
375                         numAs.push_back(7); numBs.push_back(7); //glu, glu
376                     }else {
377                         if (numB < ser2) { numB++; }
378                         numAs.push_back(6); numBs.push_back(numB); //gln, numB
379                         numAs.push_back(7); numBs.push_back(numB); //glu, numB
380                     }
381                 }
382             }else {
383                 if (numA < ser2) { numA++; }
384                 if (numB == asx) { //B asn or asp
385                     numAs.push_back(numA); numBs.push_back(3); //numA, asn
386                     numAs.push_back(numA); numBs.push_back(4); //numA, asp
387                     numAs.push_back(numA); numBs.push_back(3); //numA, asn
388                     numAs.push_back(numA); numBs.push_back(4); //numA, asp
389                 }else if (numB == glx) { //Z gln or glu (6 or 7)
390                     numAs.push_back(numA); numBs.push_back(6); //numA, gln
391                     numAs.push_back(numA); numBs.push_back(7); //numA, glu
392                     numAs.push_back(numA); numBs.push_back(6); //numA, gln
393                     numAs.push_back(numA); numBs.push_back(7); //numA, glu
394                 }
395             }
396         }
397     }
398     catch(exception& e) {
399         m->errorOut(e,  "DistCalc", "fillNums");
400         exit(1);
401     }
402 }
403 /***********************************************************************/
makeDists(Protein A,Protein B,double eigs[20],double probs[20][20])404 double DistCalc::makeDists(Protein A, Protein B, double eigs[20], double probs[20][20]){
405     try {
406         int numBases = A.getAlignLength();
407         vector<AminoAcid> seqA = A.getAligned();
408         vector<AminoAcid> seqB = B.getAligned();
409 
410         bool inf = false; bool neginfinity = false; bool overlap = false;
411         double delta, lnlike, slope, curv, tt;
412         tt = 0.1; delta = tt / 2.0;
413 
414         for (int l = 0; l < 20; l++) {
415 
416             //reset for this attempt
417             lnlike = 0.0; slope = 0.0; curv = 0.0; neginfinity = false; overlap = false;
418             double oldweight = 1.0;
419 
420             if (m->getControl_pressed()) { break; }
421 
422             for (int i = 0; i < numBases; i++) {
423                 int numA = seqA[i].getNum();
424                 int numB = seqB[i].getNum();
425 
426                 if (numA != stop && numA != del && numA != quest && numA != unk &&
427                     numB != stop && numB != del && numB != quest && numB != unk) {
428 
429                     double p = 0.0; double dp = 0.0; double d2p = 0.0; overlap = true;
430 
431                     vector<int> numAs; vector<int> numBs;
432                     if (numA != asx && numA != glx && numB != asx && numB != glx) {
433                         if (numA < ser2) { numA++; }
434                         if (numB < ser2) { numB++; }
435                         numAs.push_back(numA); numBs.push_back(numB); //+1 avoid 0
436                     }else {
437                         fillNums(numAs, numBs, numA, numB);
438                     }
439 
440                     predict(numAs, numBs, p, dp, d2p, tt, eigs, probs);
441 
442                     if (p <= 0.0) {
443                         neginfinity = true;
444                     }else {
445                         slope += oldweight*dp / p;
446                         curv += oldweight*(d2p / p - dp * dp / (p * p));
447 
448                         //printf("%ld:%ld, dp = %f, p = %f, d2p = %f\n", l, i, dp, p, d2p);
449 
450                         //printf("%ld:%ld, slope = %f, curv = %f, oldweight[i] = %ld\n", l, i, slope, curv, oldweight);
451                     }
452                 }//endif stop
453             }//endif bases
454 
455             if (!overlap) {
456                 tt = -1.0; l += 20; inf = true;
457             }else if (!neginfinity) {
458                 if (curv < 0.0) {
459                     tt -= slope / curv;
460                     if (tt > 10000.0) { tt = -1.0; l += 20; inf = true;  }
461                 }else {
462                     if ((slope > 0.0 && delta < 0.0) || (slope < 0.0 && delta > 0.0)) { delta /= -2; }
463                     tt += delta;
464                 }
465             }else {
466                 delta /= -2;
467                 tt += delta;
468             }
469 
470             if (tt < 0.00001 && !inf) { tt = 0.00001; }
471 
472         }//endif attempts
473 
474         dist = tt;
475         //exit(1);
476 
477         return dist;
478     }
479     catch(exception& e) {
480         m->errorOut(e,  "DistCalc", "makeDists");
481         exit(1);
482     }
483 }
484 /***********************************************************************/
485