1RequireVersion("2.3.0");
2
3LoadFunctionLibrary("libv3/convenience/matrix.bf");
4LoadFunctionLibrary("libv3/UtilityFunctions.bf");
5
6/* define various genetic code translation tables
7
8   Table definitions used here can be found on the NCBI web page at
9   https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
10
11  	here's how HyPhy codes translate to aminoacids
12
13 	0 == Phe
14 	1 == Leu
15 	2 == Ile
16 	3 == Met
17 	4 == Val
18 	5 == Ser
19 	6 == Pro
20 	7 == Thr
21 	8 == Ala
22 	9 == Tyr
23 	10 == Stop
24 	11 == His
25 	12 == Gln
26 	13 == Asn
27 	14 == Lys
28 	15 == Asp
29 	16 == Glu
30 	17 == Cys
31 	18 == Trp
32 	19 == Arg
33 	20 == Gly
34
35 	AAA,AAC,AAG....TTA,TTC,TTG,TTT - 64 all in all*/
36
37
38/* defines model states which are not allowed, i.e. termination codons.
39   GeneticCodeExclusions string is used by DataSetFilter to
40   eliminate "illegal" states from the data */
41
42
43
44_geneticCodeOptionMatrix = {
45    {
46        "Universal", "Universal code. (Genebank transl_table=1)."
47    } {
48        "Vertebrate-mtDNA", "Vertebrate mitochondrial DNA code. (Genebank transl_table=2)."
49    } {
50        "Yeast-mtDNA", "Yeast mitochondrial DNA code. (Genebank transl_table=3)."
51    } {
52        "Mold-Protozoan-mtDNA", "Mold, Protozoan and Coelenterate mitochondrial DNA and the Mycloplasma/Spiroplasma code. (Genebank transl_table=4)."
53    } {
54        "Invertebrate-mtDNA", "Invertebrate mitochondrial DNA code. (Genebank transl_table=5)."
55    } {
56        "Ciliate-Nuclear", "Ciliate, Dasycladacean and Hexamita Nuclear code. (Genebank transl_table=6)."
57    } {
58        "Echinoderm-mtDNA", "Echinoderm mitochondrial DNA code. (Genebank transl_table=9)."
59    } {
60        "Euplotid-Nuclear", "Euplotid Nuclear code. (Genebank transl_table=10)."
61    } {
62        "Alt-Yeast-Nuclear", "Alternative Yeast Nuclear code. (Genebank transl_table=12)."
63    } {
64        "Ascidian-mtDNA", "Ascidian mitochondrial DNA code. (Genebank transl_table=13)."
65    } {
66        "Flatworm-mtDNA", "Flatworm mitochondrial DNA code. (Genebank transl_table=14)."
67    } {
68        "Blepharisma-Nuclear", "Blepharisma Nuclear code. (Genebank transl_table=15)."
69    } {
70        "Chlorophycean-mtDNA", "Chlorophycean Mitochondrial Code (transl_table=16)."
71    } {
72        "Trematode-mtDNA", "Trematode Mitochondrial Code (transl_table=21)."
73    } {
74        "Scenedesmus-obliquus-mtDNA", "Scenedesmus obliquus mitochondrial Code (transl_table=22)."
75    } {
76        "Thraustochytrium-mtDNA", "Thraustochytrium Mitochondrial Code (transl_table=23)."
77    } {
78        "Pterobranchia-mtDNA", "Pterobranchia Mitochondrial Code (transl_table=24)."
79    } {
80        "SR1-and-Gracilibacteria", "Candidate Division SR1 and Gracilibacteria Code (transl_table=25)."
81    } {
82        "Pachysolen-Nuclear", "Pachysolen tannophilus Nuclear Code (transl_table=26)."
83    }{
84        "Mesodinium-Nuclear", "Mesodinium Nuclear Code (transl_table=29)"
85    }{
86        "Peritrich-Nuclear", "Peritrich Nuclear Code (transl_table=30)"
87    }{
88        "Cephalodiscidae-mtDNA", "Cephalodiscidae Mitochondrial UAA-Tyr Code (transl_table=33)"
89    }
90
91};
92
93_hyphyAAOrdering        = "FLIMVSPTAYXHQNKDECWRG";
94_alphabeticalAAOrdering = "ACDEFGHIKLMNPQRSTVWY";
95
96_singleAALetterToFullName = {
97    "A": "Alanine",
98    "C": "Cysteine",
99    "D": "Aspartic Acid",
100    "E": "Glutamic Acid",
101    "F": "Phenylalanine",
102    "G": "Glycine",
103    "H": "Histidine",
104    "I": "Isoleucine",
105    "K": "Lysine",
106    "L": "Leucine",
107    "M": "Methionine",
108    "N": "Aspargine",
109    "P": "Proline",
110    "Q": "Glutamine",
111    "R": "Arginine",
112    "S": "Serine",
113    "T": "Theronine",
114    "V": "Valine",
115    "W": "Tryptophan",
116    "Y": "Tyrosine",
117    "X": "Stop Codon"
118};
119
120if (!skipCodeSelectionStep) {
121
122    ChoiceList(modelType, "Choose Genetic Code", 1, SKIP_NONE, _geneticCodeOptionMatrix);
123
124    if (modelType < 0) {
125        return;
126    }
127
128    ApplyGeneticCodeTable(modelType);
129}
130
131genetic_code.stop_code = 10;
132
133/*----------------------------------------------------------------------------------------------------------*/
134
135MapCodonIndex.lookup = {"A" : 0, "C" : 1, "G" : 2, "T" : 3};
136
137lfunction MapCodonIndex(codon_string) {
138    codon_string = codon_string && 1;
139    codon = 0;
140    for (k = 0; k < 3; k+=1) {
141        codon += (^"MapCodonIndex.lookup")[codon_string[k]] * (4^(2-k));
142    }
143    return (codon+0.5)$1;
144}
145
146
147/*----------------------------------------------------------------------------------------------------------*/
148
149function CountSenseCodons(code) {
150    return +code["_MATRIX_ELEMENT_VALUE_!=genetic_code.stop_code"];
151}
152
153/*----------------------------------------------------------------------------------------------------------*/
154
155
156function ApplyGeneticCodeTable(myModelType) {
157    _Genetic_Code = {
158        {
159            14, /*AAA*/ 13, /*AAC*/ 14, /*AAG*/ 13, /*AAT*/
160            7, /*ACA*/ 7, /*ACC*/ 7, /*ACG*/ 7, /*ACT*/
161            19, /*AGA*/ 5, /*AGC*/ 19, /*AGG*/ 5, /*AGT*/
162            2, /*ATA*/ 2, /*ATC*/ 3, /*ATG*/ 2, /*ATT*/
163            12, /*CAA*/ 11, /*CAC*/ 12, /*CAG*/ 11, /*CAT*/
164            6, /*CCA*/ 6, /*CCC*/ 6, /*CCG*/ 6, /*CCT*/
165            19, /*CGA*/ 19, /*CGC*/ 19, /*CGG*/ 19, /*CGT*/
166            1, /*CTA*/ 1, /*CTG*/ 1, /*CTC*/ 1, /*CTT*/
167            16, /*GAA*/ 15, /*GAC*/ 16, /*GAG*/ 15, /*GAT*/
168            8, /*GCA*/ 8, /*GCC*/ 8, /*GCG*/ 8, /*GCT*/
169            20, /*GGA*/ 20, /*GGC*/ 20, /*GGG*/ 20, /*GGT*/
170            4, /*GTA*/ 4, /*GTC*/ 4, /*GTG*/ 4, /*GTT*/
171            10, /*TAA*/ 9, /*TAC*/ 10, /*TAG*/ 9, /*TAT*/
172            5, /*TCA*/ 5, /*TCC*/ 5, /*TCG*/ 5, /*TCT*/
173            10, /*TGA*/ 17, /*TGC*/ 18, /*TGG*/ 17, /*TGT*/
174            1, /*TTA*/ 0, /*TTC*/ 1, /*TTG*/ 0 /*TTT*/
175        }
176    };
177
178
179    GeneticCodeExclusions = "TAA,TAG,TGA";
180
181    if (myModelType == 1)
182    /* Vertebrate mtDNA */
183    {
184        _Genetic_Code[8] = 10; /* AGA => stop */
185        _Genetic_Code[10] = 10; /* AGG => stop */
186        _Genetic_Code[12] = 3; /* ATA => Met  */
187        _Genetic_Code[56] = 18; /* TGA => Trp  */
188
189        GeneticCodeExclusions = "AGA,AGG,TAA,TAG";
190        return myModelType;
191    }
192
193    if (myModelType == 2)
194    /* Yeast mtDNA */
195    {
196        _Genetic_Code[12] = 3; /* ATA => Met */
197        _Genetic_Code[28] = 7; /* CTA => Thr */
198        _Genetic_Code[29] = 7; /* CTC => Thr */
199        _Genetic_Code[30] = 7; /* CTG => Thr */
200        _Genetic_Code[31] = 7; /* CTT => Thr */
201        _Genetic_Code[56] = 18; /* TGA => Trp */
202
203        GeneticCodeExclusions = "TAA,TAG";
204        return myModelType;
205   }
206
207    if (myModelType == 3)
208    /* Mold,Protozoan and Coelenterate mtDNA */
209    {
210        _Genetic_Code[56] = 18; /* TGA => Trp */
211        GeneticCodeExclusions = "TAA,TAG";
212        return myModelType;
213    }
214
215    if (myModelType == 4)
216    /* Invertebrate mtDNA */
217    {
218        _Genetic_Code[8] = 5; /* AGA => Ser  */
219        _Genetic_Code[10] = 5; /* AGG => Ser  */
220        _Genetic_Code[12] = 3; /* ATA => Met  */
221        _Genetic_Code[56] = 18; /* TGA => Trp  */
222
223        GeneticCodeExclusions = "TAA,TAG";
224        return myModelType;
225    }
226
227    if (myModelType == 5)
228    /* Ciliate Nuclear Code */
229    {
230        _Genetic_Code[48] = 12; /* TAA => Gln  */
231        _Genetic_Code[50] = 12; /* TAG => Gln  */
232
233        GeneticCodeExclusions = "TGA";
234        return myModelType;
235    }
236
237    if (myModelType == 6)
238    /* Echinoderm mtDNA */
239    {
240        _Genetic_Code[0] = 13; /* AAA => Asn  */
241        _Genetic_Code[8] = 5; /* AGA => Ser  */
242        _Genetic_Code[10] = 5; /* AGG => Ser  */
243        _Genetic_Code[56] = 18; /* TGA => Trp  */
244
245        GeneticCodeExclusions = "TAA,TAG";
246        return myModelType;
247    }
248
249    if (myModelType == 7)
250    /* Euplotid mtDNA */
251    {
252        _Genetic_Code[56] = 17; /* TGA => Cys  */
253
254        GeneticCodeExclusions = "TAA,TAG";
255        return myModelType;
256    }
257
258    if (myModelType == 8)
259    /* Alternative Yeast Nuclear */
260    {
261        _Genetic_Code[30] = 5; /* CTG => Ser  */
262
263        GeneticCodeExclusions = "TAA,TAG,TGA";
264        return myModelType;
265    }
266
267    if (myModelType == 9)
268    /* Ascidian mtDNA */
269    {
270        _Genetic_Code[8] = 20; /* AGA => Gly  */
271        _Genetic_Code[10] = 20; /* AGG => Gly  */
272        _Genetic_Code[12] = 3; /* AGG => Met */
273        _Genetic_Code[56] = 18; /* TGA => Trp  */
274
275        GeneticCodeExclusions = "TAA,TAG";
276        return myModelType;
277    }
278
279    if (myModelType == 10)
280    /* Flatworm mtDNA */
281    {
282        _Genetic_Code[0] = 13; /* AAA => Asn  */
283        _Genetic_Code[8] = 5; /* AGA => Ser  */
284        _Genetic_Code[10] = 5; /* AGG => Ser  */
285        _Genetic_Code[48] = 9; /* TAA => Tyr */
286        _Genetic_Code[56] = 18; /* TGA => Trp  */
287
288        GeneticCodeExclusions = "TAG";
289        return myModelType;
290    }
291
292    if (myModelType == 11)
293    /* Blepharisma Nuclear */
294    {
295        _Genetic_Code[50] = 12; /* TAG => Gln  */
296
297        GeneticCodeExclusions = "TAA,TGA";
298        return myModelType;
299   }
300
301
302    if (myModelType == 12)
303    /* Chlorophycean Mitochondrial Code */
304    {
305        _Genetic_Code[50] = 1; /* TAG => Leu  */
306
307        GeneticCodeExclusions = "TAA,TGA";
308        return myModelType;
309   }
310
311    if (myModelType == 13)
312    /* Trematode Mitochondrial Code */
313    {
314        _Genetic_Code[56] = 18; /* TGA => Trp  */
315        _Genetic_Code[12] = 3; /* ATA => Met  */
316        _Genetic_Code[8] = 5; /* AGA => Ser  */
317        _Genetic_Code[10] = 5; /* AGG => Trp  */
318        _Genetic_Code[0] = 13; /* AAA => Asn  */
319
320        GeneticCodeExclusions = "TAA,TAG";
321        return myModelType;
322    }
323
324    if (myModelType == 14)
325    /*  Scenedesmus obliquus mitochondrial Code */
326    {
327        _Genetic_Code[52] = 10; /* TCA => Stop  */
328        _Genetic_Code[50] = 1; /* TAG => Leu  */
329
330        GeneticCodeExclusions = "TAA,TCA,TGA";
331        return myModelType;
332    }
333
334    if (myModelType == 15)
335    /*  Thraustochytrium mtDNA */
336    {
337        _Genetic_Code[60] = 10; /* TTA => Stop  */
338
339        GeneticCodeExclusions = "TAA,TAG,TGA,TTA";
340        return myModelType;
341    }
342
343    if (myModelType == 16)
344    /*   Pterobranchia mtDNA */
345    {
346        _Genetic_Code[56]  = 18; /* TGA => Trp  */
347        _Genetic_Code[8]  = 5; /* AGA => Serine */
348        _Genetic_Code[10]  = 14; /* AGG => Lysine */
349
350        GeneticCodeExclusions = "TAA,TAG";
351        return myModelType;
352    }
353
354    if (myModelType == 17)
355    /*   Candidate Division SR1 */
356    {
357        _Genetic_Code[56]  = 20; /* TGA => Gly  */
358
359        GeneticCodeExclusions = "TAA,TAG";
360        return myModelType;
361    }
362
363    if (myModelType == 18)
364    /*   Pachysolen tannophilus nuclear */
365    {
366        _Genetic_Code[30]  = 8; /* CTG => Ala  */
367
368        GeneticCodeExclusions = "TAA,TAG,TGA";
369        return myModelType;
370    }
371
372    if (myModelType == 19)
373    /*   Mesodinium Nuclear */
374    {
375        _Genetic_Code[48]  = 9; /* TAA => Tyr  */
376        _Genetic_Code[50]  = 9; /* TAG => Tyr  */
377
378        GeneticCodeExclusions = "TGA";
379        return myModelType;
380    }
381
382    if (myModelType == 20)
383    /*   Peritrich Nuclear */
384    {
385        _Genetic_Code[48]  = 16; /* TAA => Glu  */
386        _Genetic_Code[50]  = 16; /* TAG => Glu  */
387
388        GeneticCodeExclusions = "TGA";
389        return myModelType;
390    }
391
392    if (myModelType == 21)
393    /*   Cephalodiscidae Mitochondrial */
394    {
395        _Genetic_Code[48]  = 9; /* TAA => Tyr  */
396        _Genetic_Code[56]  = 18; /* TGA => Trp  */
397        _Genetic_Code[8]  = 5; /* AGA => Ser  */
398        _Genetic_Code[10]  = 14; /* AGG => Lys  */
399
400        GeneticCodeExclusions = "TAG";
401        return myModelType;
402    }
403
404
405    return myModelType;
406}
407
408/*----------------------------------------------------------------------------------------------------------*/
409
410function mapCodonsToAAGivenMappingAux(codonSeq, aaSequence, mapping, this_many_mm) {
411    seqLen = Abs(aaSequence);
412    translString = "";
413    translString * (seqLen);
414    seqLenN = Abs(codonSeq);
415
416    aaPos = 0;
417    seqPos = 0;
418    codon = codonSeq[seqPos][seqPos + 2];
419    currentAA = mapping[codon];
420    mismatch_count = 0;
421
422    for (aaPos = 0; aaPos < seqLen && seqPos < seqLenN; aaPos += 1) {
423        advance = 1;
424        copy_codon = 1;
425
426        if (currentAA != 0) {
427            if (aaSequence[aaPos] == "-") {
428                if (currentAA != "X") {
429                    translString * "---";
430                    advance = 0;
431                }
432            } else {
433                mismatch_count += (aaSequence[aaPos] != currentAA);
434                if (this_many_mm == 1) {
435                    assert(mismatch_count < this_many_mm, "A mismatch between codon and protein sequences at position " + aaPos + " (codon `seqPos`) : codon '" + codonSeq[seqPos][seqPos + 2] + "'(`currentAA`) a.a. '`aaSequence[aaPos]`'");
436                } else {
437                    if (mismatch_count >= this_many_mm) {
438                        translString * 0;
439                        return None;
440                    }
441                }
442            }
443        } else {
444            copy_codon = 0;
445        }
446
447        if (advance) {
448            if (copy_codon) {
449                if (currentAA == "X") {
450                    translString * "---";
451                } else {
452                    translString * codon;
453                }
454            } else {
455                //fprintf (stdout, "Skipping codon ", codon, "\n");
456                aaPos = aaPos - 1;
457            }
458            seqPos += 3;
459            codon = codonSeq[seqPos][seqPos + 2];
460            currentAA = mapping[codon];
461        }
462    }
463
464
465    translString * 0;
466
467    return translString;
468}
469
470/*----------------------------------------------------------------------------------------------------------*/
471
472function mapCodonsToAAGivenMapping(codonSeq, aaSequence, mapping) {
473    return mapCodonsToAAGivenMappingAux(codonSeq, aaSequence, mapping, 1);
474}
475
476
477/*----------------------------------------------------------------------------------------------------------*/
478
479function mapCodonsToAA(codonSeq, aaSequence) {
480    return mapCodonsToAAGivenMapping(codonSeq, aaSequence, defineCodonToAA());
481}
482
483/*----------------------------------------------------------------------------------------------------------*/
484
485function mapCodonsToAAFuzzy(codonSeq, aaSequence, mismatches) {
486    return mapCodonsToAAGivenMappingAux(codonSeq, aaSequence, defineCodonToAA(), mismatches);
487}
488
489
490/*----------------------------------------------------------------------------------------------------------*/
491
492lfunction CompareCodonProperties(codon1, codon2, code)
493    /* given:
494    		 codon1 (a number between 0 and 63 in AAA...TTT encoding),
495    		 codon2 (same encoding),
496    		 code (the genetic code)
497
498    	returns a dictionary with the following keys:
499
500    		"NONSYNONYMOUS" : [BOOLEAN] set to 1 if codon1 <-> codon2 is a non-synynomous substitution, otherwise 0
501    		"DIFFERENCES"   : [INTEGER 0,1,2,3] set to the number of nucleotide differences
502    		"BY_POSITION"	: [BOOLEAN MATRIX] a 1x3 matrix, where the i-th entry is 1 if the corresponding nucleotide position is different between the codons
503    		"1"				: [1x2 MATRIX]	   nucleotide substitution in position 1 (from -> to) encoded as an index into "ACGT"
504    										   for example, codon1 = TCT, codon 2 = GCT, this matrix will be {{3,2}}
505
506    		"2"				: ... same for the second position
507    		"3"				: ... same for the third  position
508    */
509
510{
511    _codonCompResult = {};
512
513    _codonCompResult["NONSYNONYMOUS"] = (code[codon1] != code[codon2]);
514    _codonCompResult["BY_POSITION"] = {
515        1, 2
516    };
517
518    _positionMatrix = {
519        {
520            codon1 % 4, codon2 % 4
521        }
522    };
523
524    for (_ci = 0; _ci < 3; _ci += 1) {
525
526        _codonCompResult[1 + _ci] = Eval(_positionMatrix);
527        (_codonCompResult["BY_POSITION"])[_ci] = (_positionMatrix[0] != _positionMatrix[1]);
528
529        codon1 = codon1 $ 4;
530        codon2 = codon2 $ 4;
531    }
532
533    _codonCompResult["DIFFERENCES"] = (_codonCompResult["BY_POSITION"])[0] + (_codonCompResult["BY_POSITION"])[1] + (_codonCompResult["BY_POSITION"])[2];
534
535    return _codonCompResult;
536}
537
538
539
540/*----------------------------------------------------------------------------------------------------------*/
541
542function defineCodonToAA() {
543    return defineCodonToAAGivenCode(_Genetic_Code);
544}
545
546/*----------------------------------------------------------------------------------------------------------*/
547
548function defineCodonToAAGivenCode(code) {
549    codonToAAMap = {};
550    nucChars     = "ACGT";
551
552    for (p1 = 0; p1 < 64; p1 += 1) {
553        codonToAAMap[nucChars[p1$16] + nucChars[p1 % 16 $4] + nucChars[p1 % 4]] = _hyphyAAOrdering[code[p1]];
554    }
555
556    return codonToAAMap;
557}
558
559/*----------------------------------------------------------------------------------------------------------*/
560
561function findAllCodonsForAA(aa) {
562    codonsForAA = {};
563
564    for (p1 = 0; p1 < 64; p1 = p1 + 1) {
565        if (_hyphyAAOrdering[_Genetic_Code[p1]] == aa) {
566            codonsForAA[p1] = 1;
567        }
568    }
569
570    return codonsForAA;
571}
572
573/*----------------------------------------------------------------------------------------------------------*/
574
575function RawToSense(code)
576/*
577    given:
578    		genetic code,
579
580    returns a 64x1 matrix mapping raw codons to sense codons only (stops are mapped to -1)
581*/
582{
583    _codonMap = {
584        64, 1
585    };
586
587    _cShift = 0;
588    for (_ci = 0; _ci < 64; _ci = _ci + 1) {
589        if (code[_ci] == genetic_code.stop_code) {
590            _cShift = _cShift + 1;
591            _codonMap[_ci] = -1;
592        } else {
593            _codonMap[_ci] = _ci - _cShift;
594        }
595    }
596
597    return _codonMap;
598}
599
600
601/*----------------------------------------------------------------------------------------------------------*/
602
603function IsTransition(pair)
604/*
605    given:
606    		a pair of nucleotides (as a 1x2 matrix, e.g. as returned by CompareCodonProperties["1"]),
607
608    returns 1 if the substitution is a transition
609    returns -1 if the substitution is a transversion
610
611    RETURNS 0 IF NO SUBSTITUTION TOOK PLACE
612*/
613{
614    if (pair[0] != pair[1]) {
615        if (Abs(pair[0] - pair[1]) % 2 == 0) {
616            return 1;
617        }
618        return -1;
619    }
620    return 0;
621}
622
623/*----------------------------------------------------------------------------------------------------------*/
624
625lfunction IsStop(codon, code)
626
627/*
628	given:
629		 codon (a number between 0 and 63 in AAA...TTT encoding)
630		 code (the genetic code)
631
632	returns
633		 whether or not the codon is a stop codon
634*/
635
636{
637    return code[codon] == ^ "genetic_code.stop_code";
638}
639
640/*----------------------------------------------------------------------------------------------------------*/
641
642function translateCodonToAA(codonSeq, mapping, offset) {
643    seqLen = Abs(codonSeq);
644    translString = "";
645    translString * (seqLen / 3 + 1);
646    for (seqPos = offset; seqPos < seqLen; seqPos += 3) {
647        codon = codonSeq[seqPos][seqPos + 2];
648        prot = mapping[codon];
649        if (Abs(prot)) {
650            translString * prot;
651        } else {
652            if (codon == "---") {
653                translString * "-";
654            } else {
655                translString * "?";
656            }
657        }
658    }
659    translString * 0;
660    translString = translString ^ {
661        {
662            "X$", "?"
663        }
664    };
665
666    return translString;
667}
668
669
670
671/*----------------------------------------------------------------------------------------------------------*/
672
673lfunction ComputeCodonCodeToStringMap(genCode) {
674    _codonMap = {};
675    _nucLetters = "ACGT";
676    for (_idx = 0; _idx < Columns(genCode); _idx += 1) {
677        if (genCode[_idx] != ^ "genetic_code.stop_code") {
678            _codonMap + (_nucLetters[_idx$16] + _nucLetters[(_idx % 16) $4] + _nucLetters[_idx % 4]);
679        }
680    }
681    return _codonMap;
682}
683
684/*----------------------------------------------------------------------------------------------------------*/
685
686lfunction ComputeCodonCodeToStringMapStop (genCode) {
687	_codonMap = {};
688	_nucLetters = "ACGT";
689	for (_idx = 0; _idx < Columns(genCode); _idx += 1) {
690		if (genCode[_idx] == ^ "genetic_code.stop_code") {
691			_codonMap + (_nucLetters[_idx$16] + _nucLetters[(_idx%16)$4] + _nucLetters[_idx%4]);
692		}
693	}
694	return _codonMap;
695}
696
697/*----------------------------------------------------------------------------------------------------------*/
698
699lfunction genetic_code.partition_codon(codon) {
700    return Eval({
701        {
702            codon$16, (codon % 16) $4, codon % 4
703        }
704    });
705}
706
707lfunction genetic_code.assemble_codon(positions) {
708    return positions[0] * 16 + positions[1] * 4 + positions[2];
709}
710
711/*----------------------------------------------------------------------------------------------------------*/
712
713lfunction genetic_code.ComputeBranchLengthStencils(genCode) {
714 /*
715            given a genetic code (`genCode`), computes a matrix of N x N entries (N = sense codons)
716            where a value of 1 in cell (i,j) means that i <-> j is a substitution of a specific type
717            (e.g. synonymous or non-synonymous), and a value of 0 is assigned to all other cells.
718
719            returns a dictionary with 'synonymous' and 'non-synonymous' keys
720
721            Also see inline comments
722
723        */
724
725    sense_codons = CountSenseCodons (genCode);
726    SS = {sense_codons, sense_codons};
727    NS = {sense_codons, sense_codons};
728
729    stop_code = ^ "genetic_code.stop_code";
730    codon_offset = 0;
731
732
733    for (codon = 0; codon < 64; codon += 1) {
734        if (genCode[codon] == stop_code) {
735            codon_offset += 1;
736        } else {
737            aa1 = genCode [codon];
738            codon_offset2 = codon_offset;
739            for (codon2 = codon + 1; codon2 < 64; codon2 += 1) {
740                if (genCode [codon2] == stop_code) {
741                    codon_offset2 += 1;
742                } else {
743                    if (aa1 == genCode [codon2]) {
744                        SS [codon-codon_offset][codon2-codon_offset2] = 1;
745                    } else {
746                        NS [codon-codon_offset][codon2-codon_offset2] = 1;
747                    }
748                }
749            }
750        }
751    }
752
753    matrix.Symmetrize(SS);
754    matrix.Symmetrize(NS);
755
756    return {"synonymous" : SS, "non-synonymous" : NS};
757
758}
759
760/*----------------------------------------------------------------------------------------------------------*/
761
762lfunction genetic_code.DefineCodonToAAMapping (code) {
763    codonToAAMap = {};
764    nucChars = "ACGT";
765
766    for (p = 0; p < 64; p += 1) {
767        codonToAAMap[nucChars[p$16] + nucChars[p % 16 $4] + nucChars[p % 4]] = (^"_hyphyAAOrdering")[code[p]];
768    }
769
770    return codonToAAMap;
771}
772
773
774/*----------------------------------------------------------------------------------------------------------*/
775
776lfunction genetic_code.DefineIntegerToAAMapping (code, only_sense) {
777    codon_code_map = {};
778
779    shift = 0;
780
781    for (p = 0; p < 64; p += 1) {
782        if (IsStop (p, code)) {
783            if (only_sense) {
784               shift += 1;
785               continue;
786            }
787        }
788
789        codon_code_map[p-shift] = (^"_hyphyAAOrdering")[code[p]];
790    }
791
792    return codon_code_map;
793}
794
795/*----------------------------------------------------------------------------------------------------------*/
796
797lfunction genetic_code.ComputePairwiseDifferencesAndExpectedSites(genCode, options) {
798        /*
799            given a genetic code (`genCode`), computes a number of per-codon (or per pair of codons)
800            quantities that relate to numbers of synonymous and non-synonymous sites or
801            substitutions.
802
803            `options` can be null, or have any of the following keys:
804
805                `weighting-matrix` is expected to be a set of 3 4x4 matrices showing relative frequencies of
806                various nucleotide->nucleotide substitutions stratified by codon position; by default they
807                are all equal
808
809                `count-stop-codons` treat mutations to stop codons as non-synonymous changes for counting purposes
810                (by the default they are not counted at all)
811
812            Also see inline comments
813
814        */
815
816        SS = {
817            64, 1
818        }; // raw codon index -> # of synonymous sites     [0-3]
819        NS = SS; // raw codon index -> # of non-synonymous sites [0-3]
820
821        stop_code = ^ "genetic_code.stop_code";
822
823        if (Type(options["weighting-matrix"]) == "AssociativeList") {
824            weighting_matrix = options["weighting-matrix"];
825        } else {
826            equal = {
827                4, 4
828            }["1"];
829            weighting_matrix = {};
830            weighting_matrix + equal;
831            weighting_matrix + equal;
832            weighting_matrix + equal;
833        }
834
835        keep_stop_codons = FALSE;
836
837        if (Type(options["count-stop-codons"]) == "Number") {
838            keep_stop_codons = options["count-stop-codons"];
839        }
840
841        codon_offset = 0;
842
843        for (codon = 0; codon < 64; codon += 1) {
844
845            if (genCode[codon] == stop_code) {
846                codon_offset += 1;
847            } else {
848
849                codon_info = genetic_code.partition_codon(codon);
850                aa = genCode[codon];
851
852                for (codon_position = 0; codon_position < 3; codon_position += 1) {
853                    norm_factor = 0.0;
854                    sSites = 0.0;
855                    nsSites = 0.0;
856                    // mutagenize 'codon' at 'codon_position'
857
858                    copy_codon = codon_info;
859                    for (new_nuc = 0; new_nuc < 4; new_nuc += 1) {
860                        if (new_nuc != codon_info[codon_position]) {
861                            copy_codon[codon_position] = new_nuc;
862                            new_codon = genetic_code.assemble_codon(copy_codon);
863                            w = (weighting_matrix[codon_position])[codon_info[codon_position]][new_nuc];
864                            if (keep_stop_codons || stop_code == genCode[new_codon] == 0) {
865                                if (genCode[new_codon] != aa) {
866                                    nsSites += w;
867                                } else {
868                                    sSites += w;
869                                }
870                            }
871                            norm_factor += w;
872                        }
873                    }
874
875                    if (norm_factor > 0) {
876                        SS[codon] += sSites / norm_factor;
877                        NS[codon] += nsSites / norm_factor;
878                    }
879
880                }
881            }
882        }
883
884        senseCodonCount = 64 - codon_offset;
885
886        EPS = {
887            senseCodonCount, senseCodonCount
888        };
889        EPN = EPS;
890        OPS = EPS;
891        OPN = EPS;
892        NTP = EPS["-1"];
893
894        empty_dict = {};
895
896        codon_offset_1 = 0;
897
898        all_permutations = {
899            "0": {
900                {
901                    0, 1, 2
902                }
903            },
904            "1": {
905                {
906                    0, 2, 1
907                }
908            },
909            "2": {
910                {
911                    1, 0, 2
912                }
913            },
914            "3": {
915                {
916                    1, 2, 0
917                }
918            },
919            "4": {
920                {
921                    2, 0, 1
922                }
923            },
924            "5": {
925                {
926                    2, 1, 0
927                }
928            }
929        };
930
931        ntp_matrix = {
932            {
933                0, 0, 1, 2
934            } {
935                0, 0, 3, 4
936            } {
937                0, 0, 0, 5
938            } {
939                0, 0, 0, 0
940            }
941        };
942
943        matrix.Symmetrize(ntp_matrix);
944
945
946        for (codon_1 = 0; codon_1 < 64; codon_1 += 1) {
947            if (genCode[codon_1] == stop_code) {
948                codon_offset_1 += 1;
949                continue;
950            }
951
952            codon_info_1 = genetic_code.partition_codon(codon_1);
953            aa_1 = genCode[codon_1];
954            direct_index_1 = codon_1 - codon_offset_1;
955
956            EPS[direct_index_1][direct_index_1] = SS[codon_1];
957            EPN[direct_index_1][direct_index_1] = NS[codon_1];
958
959            codon_offset_2 = codon_offset_1;
960
961
962            for (codon_2 = codon_1 + 1; codon_2 < 64; codon_2 += 1) {
963                if (genCode[codon_2] == stop_code) {
964                    codon_offset_2 += 1;
965                    continue;
966                }
967
968
969                codon_info_2 = genetic_code.partition_codon(codon_2);
970                aa_2 = genCode[codon_2];
971                direct_index_2 = codon_2 - codon_offset_2;
972
973
974                path_count = 0;
975                eps = 0;
976                epn = 0;
977                ops = 0;
978                opn = 0;
979                ntp = None;
980
981                for (path = 0; path < 6; path += 1) {
982                    current_codon = codon_info_1;
983                    current_aa = aa_1;
984                    codon_sequence = empty_dict;
985                    codon_sequence + codon_1;
986
987                    ps = 0;
988                    pn = 0;
989
990                    for (path_step = 0; path_step < 3; path_step += 1) {
991                        change_index = (all_permutations[path])[path_step];
992                        if (current_codon[change_index] != codon_info_2[change_index]) {
993                            current_codon[change_index] = codon_info_2[change_index];
994                            current_codon_index = genetic_code.assemble_codon(current_codon);
995                            next_aa = genCode[current_codon_index];
996                            if (next_aa == stop_code) {
997                                break;
998                            }
999                            codon_sequence + current_codon_index;
1000                            if (current_aa == next_aa) {
1001                                ps += 1;
1002                            } else {
1003                                pn += 1;
1004                            }
1005                            current_aa = next_aa;
1006                        }
1007                    }
1008
1009                    if (path_step == 3) {
1010                        path_count += 1;
1011                        path_length = Abs(codon_sequence);
1012
1013                        if (path_length == 2 && ntp == None) {
1014                            for (position = 0; position < 3; position += 1) {
1015                                if (codon_info_1[position] != codon_info_2[position]) {
1016                                    ntp = ntp_matrix[codon_info_1[position]][codon_info_2[position]];
1017                                    break;
1018                                }
1019                            }
1020                        }
1021
1022                        pes = 0;
1023                        pns = 0;
1024                        for (path_step = 0; path_step < path_length; path_step += 1) {
1025                            pes += SS[codon_sequence[path_step]];
1026                            pns += NS[codon_sequence[path_step]];
1027                        }
1028                        eps += pes / path_length;
1029                        epn += pns / path_length;
1030                        ops += ps;
1031                        opn += pn;
1032                    }
1033                }
1034
1035                if (path_count > 0) {
1036                    EPS[direct_index_1][direct_index_2] = eps / path_count;
1037                    EPN[direct_index_1][direct_index_2] = epn / path_count;
1038                    OPS[direct_index_1][direct_index_2] = ops / path_count;
1039                    OPN[direct_index_1][direct_index_2] = opn / path_count;
1040                    if (None != ntp) {
1041                        NTP[direct_index_1][direct_index_2] = ntp;
1042                    }
1043                }
1044
1045            }
1046        }
1047
1048        matrix.Symmetrize(EPS);
1049        matrix.Symmetrize(EPN);
1050        matrix.Symmetrize(OPS);
1051        matrix.Symmetrize(OPN);
1052        matrix.Symmetrize(NTP);
1053
1054        SS_sense = {senseCodonCount, 1};
1055        NS_sense = {senseCodonCount, 1};
1056
1057        codon_offset_1 = 0;
1058        for (codon_1 = 0; codon_1 < 64; codon_1 += 1) {
1059            if (genCode[codon_1] == stop_code) {
1060                codon_offset_1 += 1;
1061            }
1062            SS_sense [codon_1 - codon_offset_1] = SS[codon_1];
1063            NS_sense [codon_1 - codon_offset_1] = NS[codon_1];
1064        }
1065
1066        return {"EPS" : EPS, "EPN": EPN, "OPS" : OPS, "OPN" : OPN, "NTP" : NTP, "SS" : SS_sense, "NS": NS_sense};
1067}
1068