1RequireVersion("2.3.0"); 2 3LoadFunctionLibrary("libv3/convenience/matrix.bf"); 4LoadFunctionLibrary("libv3/UtilityFunctions.bf"); 5 6/* define various genetic code translation tables 7 8 Table definitions used here can be found on the NCBI web page at 9 https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi 10 11 here's how HyPhy codes translate to aminoacids 12 13 0 == Phe 14 1 == Leu 15 2 == Ile 16 3 == Met 17 4 == Val 18 5 == Ser 19 6 == Pro 20 7 == Thr 21 8 == Ala 22 9 == Tyr 23 10 == Stop 24 11 == His 25 12 == Gln 26 13 == Asn 27 14 == Lys 28 15 == Asp 29 16 == Glu 30 17 == Cys 31 18 == Trp 32 19 == Arg 33 20 == Gly 34 35 AAA,AAC,AAG....TTA,TTC,TTG,TTT - 64 all in all*/ 36 37 38/* defines model states which are not allowed, i.e. termination codons. 39 GeneticCodeExclusions string is used by DataSetFilter to 40 eliminate "illegal" states from the data */ 41 42 43 44_geneticCodeOptionMatrix = { 45 { 46 "Universal", "Universal code. (Genebank transl_table=1)." 47 } { 48 "Vertebrate-mtDNA", "Vertebrate mitochondrial DNA code. (Genebank transl_table=2)." 49 } { 50 "Yeast-mtDNA", "Yeast mitochondrial DNA code. (Genebank transl_table=3)." 51 } { 52 "Mold-Protozoan-mtDNA", "Mold, Protozoan and Coelenterate mitochondrial DNA and the Mycloplasma/Spiroplasma code. (Genebank transl_table=4)." 53 } { 54 "Invertebrate-mtDNA", "Invertebrate mitochondrial DNA code. (Genebank transl_table=5)." 55 } { 56 "Ciliate-Nuclear", "Ciliate, Dasycladacean and Hexamita Nuclear code. (Genebank transl_table=6)." 57 } { 58 "Echinoderm-mtDNA", "Echinoderm mitochondrial DNA code. (Genebank transl_table=9)." 59 } { 60 "Euplotid-Nuclear", "Euplotid Nuclear code. (Genebank transl_table=10)." 61 } { 62 "Alt-Yeast-Nuclear", "Alternative Yeast Nuclear code. (Genebank transl_table=12)." 63 } { 64 "Ascidian-mtDNA", "Ascidian mitochondrial DNA code. (Genebank transl_table=13)." 65 } { 66 "Flatworm-mtDNA", "Flatworm mitochondrial DNA code. (Genebank transl_table=14)." 67 } { 68 "Blepharisma-Nuclear", "Blepharisma Nuclear code. (Genebank transl_table=15)." 69 } { 70 "Chlorophycean-mtDNA", "Chlorophycean Mitochondrial Code (transl_table=16)." 71 } { 72 "Trematode-mtDNA", "Trematode Mitochondrial Code (transl_table=21)." 73 } { 74 "Scenedesmus-obliquus-mtDNA", "Scenedesmus obliquus mitochondrial Code (transl_table=22)." 75 } { 76 "Thraustochytrium-mtDNA", "Thraustochytrium Mitochondrial Code (transl_table=23)." 77 } { 78 "Pterobranchia-mtDNA", "Pterobranchia Mitochondrial Code (transl_table=24)." 79 } { 80 "SR1-and-Gracilibacteria", "Candidate Division SR1 and Gracilibacteria Code (transl_table=25)." 81 } { 82 "Pachysolen-Nuclear", "Pachysolen tannophilus Nuclear Code (transl_table=26)." 83 }{ 84 "Mesodinium-Nuclear", "Mesodinium Nuclear Code (transl_table=29)" 85 }{ 86 "Peritrich-Nuclear", "Peritrich Nuclear Code (transl_table=30)" 87 }{ 88 "Cephalodiscidae-mtDNA", "Cephalodiscidae Mitochondrial UAA-Tyr Code (transl_table=33)" 89 } 90 91}; 92 93_hyphyAAOrdering = "FLIMVSPTAYXHQNKDECWRG"; 94_alphabeticalAAOrdering = "ACDEFGHIKLMNPQRSTVWY"; 95 96_singleAALetterToFullName = { 97 "A": "Alanine", 98 "C": "Cysteine", 99 "D": "Aspartic Acid", 100 "E": "Glutamic Acid", 101 "F": "Phenylalanine", 102 "G": "Glycine", 103 "H": "Histidine", 104 "I": "Isoleucine", 105 "K": "Lysine", 106 "L": "Leucine", 107 "M": "Methionine", 108 "N": "Aspargine", 109 "P": "Proline", 110 "Q": "Glutamine", 111 "R": "Arginine", 112 "S": "Serine", 113 "T": "Theronine", 114 "V": "Valine", 115 "W": "Tryptophan", 116 "Y": "Tyrosine", 117 "X": "Stop Codon" 118}; 119 120if (!skipCodeSelectionStep) { 121 122 ChoiceList(modelType, "Choose Genetic Code", 1, SKIP_NONE, _geneticCodeOptionMatrix); 123 124 if (modelType < 0) { 125 return; 126 } 127 128 ApplyGeneticCodeTable(modelType); 129} 130 131genetic_code.stop_code = 10; 132 133/*----------------------------------------------------------------------------------------------------------*/ 134 135MapCodonIndex.lookup = {"A" : 0, "C" : 1, "G" : 2, "T" : 3}; 136 137lfunction MapCodonIndex(codon_string) { 138 codon_string = codon_string && 1; 139 codon = 0; 140 for (k = 0; k < 3; k+=1) { 141 codon += (^"MapCodonIndex.lookup")[codon_string[k]] * (4^(2-k)); 142 } 143 return (codon+0.5)$1; 144} 145 146 147/*----------------------------------------------------------------------------------------------------------*/ 148 149function CountSenseCodons(code) { 150 return +code["_MATRIX_ELEMENT_VALUE_!=genetic_code.stop_code"]; 151} 152 153/*----------------------------------------------------------------------------------------------------------*/ 154 155 156function ApplyGeneticCodeTable(myModelType) { 157 _Genetic_Code = { 158 { 159 14, /*AAA*/ 13, /*AAC*/ 14, /*AAG*/ 13, /*AAT*/ 160 7, /*ACA*/ 7, /*ACC*/ 7, /*ACG*/ 7, /*ACT*/ 161 19, /*AGA*/ 5, /*AGC*/ 19, /*AGG*/ 5, /*AGT*/ 162 2, /*ATA*/ 2, /*ATC*/ 3, /*ATG*/ 2, /*ATT*/ 163 12, /*CAA*/ 11, /*CAC*/ 12, /*CAG*/ 11, /*CAT*/ 164 6, /*CCA*/ 6, /*CCC*/ 6, /*CCG*/ 6, /*CCT*/ 165 19, /*CGA*/ 19, /*CGC*/ 19, /*CGG*/ 19, /*CGT*/ 166 1, /*CTA*/ 1, /*CTG*/ 1, /*CTC*/ 1, /*CTT*/ 167 16, /*GAA*/ 15, /*GAC*/ 16, /*GAG*/ 15, /*GAT*/ 168 8, /*GCA*/ 8, /*GCC*/ 8, /*GCG*/ 8, /*GCT*/ 169 20, /*GGA*/ 20, /*GGC*/ 20, /*GGG*/ 20, /*GGT*/ 170 4, /*GTA*/ 4, /*GTC*/ 4, /*GTG*/ 4, /*GTT*/ 171 10, /*TAA*/ 9, /*TAC*/ 10, /*TAG*/ 9, /*TAT*/ 172 5, /*TCA*/ 5, /*TCC*/ 5, /*TCG*/ 5, /*TCT*/ 173 10, /*TGA*/ 17, /*TGC*/ 18, /*TGG*/ 17, /*TGT*/ 174 1, /*TTA*/ 0, /*TTC*/ 1, /*TTG*/ 0 /*TTT*/ 175 } 176 }; 177 178 179 GeneticCodeExclusions = "TAA,TAG,TGA"; 180 181 if (myModelType == 1) 182 /* Vertebrate mtDNA */ 183 { 184 _Genetic_Code[8] = 10; /* AGA => stop */ 185 _Genetic_Code[10] = 10; /* AGG => stop */ 186 _Genetic_Code[12] = 3; /* ATA => Met */ 187 _Genetic_Code[56] = 18; /* TGA => Trp */ 188 189 GeneticCodeExclusions = "AGA,AGG,TAA,TAG"; 190 return myModelType; 191 } 192 193 if (myModelType == 2) 194 /* Yeast mtDNA */ 195 { 196 _Genetic_Code[12] = 3; /* ATA => Met */ 197 _Genetic_Code[28] = 7; /* CTA => Thr */ 198 _Genetic_Code[29] = 7; /* CTC => Thr */ 199 _Genetic_Code[30] = 7; /* CTG => Thr */ 200 _Genetic_Code[31] = 7; /* CTT => Thr */ 201 _Genetic_Code[56] = 18; /* TGA => Trp */ 202 203 GeneticCodeExclusions = "TAA,TAG"; 204 return myModelType; 205 } 206 207 if (myModelType == 3) 208 /* Mold,Protozoan and Coelenterate mtDNA */ 209 { 210 _Genetic_Code[56] = 18; /* TGA => Trp */ 211 GeneticCodeExclusions = "TAA,TAG"; 212 return myModelType; 213 } 214 215 if (myModelType == 4) 216 /* Invertebrate mtDNA */ 217 { 218 _Genetic_Code[8] = 5; /* AGA => Ser */ 219 _Genetic_Code[10] = 5; /* AGG => Ser */ 220 _Genetic_Code[12] = 3; /* ATA => Met */ 221 _Genetic_Code[56] = 18; /* TGA => Trp */ 222 223 GeneticCodeExclusions = "TAA,TAG"; 224 return myModelType; 225 } 226 227 if (myModelType == 5) 228 /* Ciliate Nuclear Code */ 229 { 230 _Genetic_Code[48] = 12; /* TAA => Gln */ 231 _Genetic_Code[50] = 12; /* TAG => Gln */ 232 233 GeneticCodeExclusions = "TGA"; 234 return myModelType; 235 } 236 237 if (myModelType == 6) 238 /* Echinoderm mtDNA */ 239 { 240 _Genetic_Code[0] = 13; /* AAA => Asn */ 241 _Genetic_Code[8] = 5; /* AGA => Ser */ 242 _Genetic_Code[10] = 5; /* AGG => Ser */ 243 _Genetic_Code[56] = 18; /* TGA => Trp */ 244 245 GeneticCodeExclusions = "TAA,TAG"; 246 return myModelType; 247 } 248 249 if (myModelType == 7) 250 /* Euplotid mtDNA */ 251 { 252 _Genetic_Code[56] = 17; /* TGA => Cys */ 253 254 GeneticCodeExclusions = "TAA,TAG"; 255 return myModelType; 256 } 257 258 if (myModelType == 8) 259 /* Alternative Yeast Nuclear */ 260 { 261 _Genetic_Code[30] = 5; /* CTG => Ser */ 262 263 GeneticCodeExclusions = "TAA,TAG,TGA"; 264 return myModelType; 265 } 266 267 if (myModelType == 9) 268 /* Ascidian mtDNA */ 269 { 270 _Genetic_Code[8] = 20; /* AGA => Gly */ 271 _Genetic_Code[10] = 20; /* AGG => Gly */ 272 _Genetic_Code[12] = 3; /* AGG => Met */ 273 _Genetic_Code[56] = 18; /* TGA => Trp */ 274 275 GeneticCodeExclusions = "TAA,TAG"; 276 return myModelType; 277 } 278 279 if (myModelType == 10) 280 /* Flatworm mtDNA */ 281 { 282 _Genetic_Code[0] = 13; /* AAA => Asn */ 283 _Genetic_Code[8] = 5; /* AGA => Ser */ 284 _Genetic_Code[10] = 5; /* AGG => Ser */ 285 _Genetic_Code[48] = 9; /* TAA => Tyr */ 286 _Genetic_Code[56] = 18; /* TGA => Trp */ 287 288 GeneticCodeExclusions = "TAG"; 289 return myModelType; 290 } 291 292 if (myModelType == 11) 293 /* Blepharisma Nuclear */ 294 { 295 _Genetic_Code[50] = 12; /* TAG => Gln */ 296 297 GeneticCodeExclusions = "TAA,TGA"; 298 return myModelType; 299 } 300 301 302 if (myModelType == 12) 303 /* Chlorophycean Mitochondrial Code */ 304 { 305 _Genetic_Code[50] = 1; /* TAG => Leu */ 306 307 GeneticCodeExclusions = "TAA,TGA"; 308 return myModelType; 309 } 310 311 if (myModelType == 13) 312 /* Trematode Mitochondrial Code */ 313 { 314 _Genetic_Code[56] = 18; /* TGA => Trp */ 315 _Genetic_Code[12] = 3; /* ATA => Met */ 316 _Genetic_Code[8] = 5; /* AGA => Ser */ 317 _Genetic_Code[10] = 5; /* AGG => Trp */ 318 _Genetic_Code[0] = 13; /* AAA => Asn */ 319 320 GeneticCodeExclusions = "TAA,TAG"; 321 return myModelType; 322 } 323 324 if (myModelType == 14) 325 /* Scenedesmus obliquus mitochondrial Code */ 326 { 327 _Genetic_Code[52] = 10; /* TCA => Stop */ 328 _Genetic_Code[50] = 1; /* TAG => Leu */ 329 330 GeneticCodeExclusions = "TAA,TCA,TGA"; 331 return myModelType; 332 } 333 334 if (myModelType == 15) 335 /* Thraustochytrium mtDNA */ 336 { 337 _Genetic_Code[60] = 10; /* TTA => Stop */ 338 339 GeneticCodeExclusions = "TAA,TAG,TGA,TTA"; 340 return myModelType; 341 } 342 343 if (myModelType == 16) 344 /* Pterobranchia mtDNA */ 345 { 346 _Genetic_Code[56] = 18; /* TGA => Trp */ 347 _Genetic_Code[8] = 5; /* AGA => Serine */ 348 _Genetic_Code[10] = 14; /* AGG => Lysine */ 349 350 GeneticCodeExclusions = "TAA,TAG"; 351 return myModelType; 352 } 353 354 if (myModelType == 17) 355 /* Candidate Division SR1 */ 356 { 357 _Genetic_Code[56] = 20; /* TGA => Gly */ 358 359 GeneticCodeExclusions = "TAA,TAG"; 360 return myModelType; 361 } 362 363 if (myModelType == 18) 364 /* Pachysolen tannophilus nuclear */ 365 { 366 _Genetic_Code[30] = 8; /* CTG => Ala */ 367 368 GeneticCodeExclusions = "TAA,TAG,TGA"; 369 return myModelType; 370 } 371 372 if (myModelType == 19) 373 /* Mesodinium Nuclear */ 374 { 375 _Genetic_Code[48] = 9; /* TAA => Tyr */ 376 _Genetic_Code[50] = 9; /* TAG => Tyr */ 377 378 GeneticCodeExclusions = "TGA"; 379 return myModelType; 380 } 381 382 if (myModelType == 20) 383 /* Peritrich Nuclear */ 384 { 385 _Genetic_Code[48] = 16; /* TAA => Glu */ 386 _Genetic_Code[50] = 16; /* TAG => Glu */ 387 388 GeneticCodeExclusions = "TGA"; 389 return myModelType; 390 } 391 392 if (myModelType == 21) 393 /* Cephalodiscidae Mitochondrial */ 394 { 395 _Genetic_Code[48] = 9; /* TAA => Tyr */ 396 _Genetic_Code[56] = 18; /* TGA => Trp */ 397 _Genetic_Code[8] = 5; /* AGA => Ser */ 398 _Genetic_Code[10] = 14; /* AGG => Lys */ 399 400 GeneticCodeExclusions = "TAG"; 401 return myModelType; 402 } 403 404 405 return myModelType; 406} 407 408/*----------------------------------------------------------------------------------------------------------*/ 409 410function mapCodonsToAAGivenMappingAux(codonSeq, aaSequence, mapping, this_many_mm) { 411 seqLen = Abs(aaSequence); 412 translString = ""; 413 translString * (seqLen); 414 seqLenN = Abs(codonSeq); 415 416 aaPos = 0; 417 seqPos = 0; 418 codon = codonSeq[seqPos][seqPos + 2]; 419 currentAA = mapping[codon]; 420 mismatch_count = 0; 421 422 for (aaPos = 0; aaPos < seqLen && seqPos < seqLenN; aaPos += 1) { 423 advance = 1; 424 copy_codon = 1; 425 426 if (currentAA != 0) { 427 if (aaSequence[aaPos] == "-") { 428 if (currentAA != "X") { 429 translString * "---"; 430 advance = 0; 431 } 432 } else { 433 mismatch_count += (aaSequence[aaPos] != currentAA); 434 if (this_many_mm == 1) { 435 assert(mismatch_count < this_many_mm, "A mismatch between codon and protein sequences at position " + aaPos + " (codon `seqPos`) : codon '" + codonSeq[seqPos][seqPos + 2] + "'(`currentAA`) a.a. '`aaSequence[aaPos]`'"); 436 } else { 437 if (mismatch_count >= this_many_mm) { 438 translString * 0; 439 return None; 440 } 441 } 442 } 443 } else { 444 copy_codon = 0; 445 } 446 447 if (advance) { 448 if (copy_codon) { 449 if (currentAA == "X") { 450 translString * "---"; 451 } else { 452 translString * codon; 453 } 454 } else { 455 //fprintf (stdout, "Skipping codon ", codon, "\n"); 456 aaPos = aaPos - 1; 457 } 458 seqPos += 3; 459 codon = codonSeq[seqPos][seqPos + 2]; 460 currentAA = mapping[codon]; 461 } 462 } 463 464 465 translString * 0; 466 467 return translString; 468} 469 470/*----------------------------------------------------------------------------------------------------------*/ 471 472function mapCodonsToAAGivenMapping(codonSeq, aaSequence, mapping) { 473 return mapCodonsToAAGivenMappingAux(codonSeq, aaSequence, mapping, 1); 474} 475 476 477/*----------------------------------------------------------------------------------------------------------*/ 478 479function mapCodonsToAA(codonSeq, aaSequence) { 480 return mapCodonsToAAGivenMapping(codonSeq, aaSequence, defineCodonToAA()); 481} 482 483/*----------------------------------------------------------------------------------------------------------*/ 484 485function mapCodonsToAAFuzzy(codonSeq, aaSequence, mismatches) { 486 return mapCodonsToAAGivenMappingAux(codonSeq, aaSequence, defineCodonToAA(), mismatches); 487} 488 489 490/*----------------------------------------------------------------------------------------------------------*/ 491 492lfunction CompareCodonProperties(codon1, codon2, code) 493 /* given: 494 codon1 (a number between 0 and 63 in AAA...TTT encoding), 495 codon2 (same encoding), 496 code (the genetic code) 497 498 returns a dictionary with the following keys: 499 500 "NONSYNONYMOUS" : [BOOLEAN] set to 1 if codon1 <-> codon2 is a non-synynomous substitution, otherwise 0 501 "DIFFERENCES" : [INTEGER 0,1,2,3] set to the number of nucleotide differences 502 "BY_POSITION" : [BOOLEAN MATRIX] a 1x3 matrix, where the i-th entry is 1 if the corresponding nucleotide position is different between the codons 503 "1" : [1x2 MATRIX] nucleotide substitution in position 1 (from -> to) encoded as an index into "ACGT" 504 for example, codon1 = TCT, codon 2 = GCT, this matrix will be {{3,2}} 505 506 "2" : ... same for the second position 507 "3" : ... same for the third position 508 */ 509 510{ 511 _codonCompResult = {}; 512 513 _codonCompResult["NONSYNONYMOUS"] = (code[codon1] != code[codon2]); 514 _codonCompResult["BY_POSITION"] = { 515 1, 2 516 }; 517 518 _positionMatrix = { 519 { 520 codon1 % 4, codon2 % 4 521 } 522 }; 523 524 for (_ci = 0; _ci < 3; _ci += 1) { 525 526 _codonCompResult[1 + _ci] = Eval(_positionMatrix); 527 (_codonCompResult["BY_POSITION"])[_ci] = (_positionMatrix[0] != _positionMatrix[1]); 528 529 codon1 = codon1 $ 4; 530 codon2 = codon2 $ 4; 531 } 532 533 _codonCompResult["DIFFERENCES"] = (_codonCompResult["BY_POSITION"])[0] + (_codonCompResult["BY_POSITION"])[1] + (_codonCompResult["BY_POSITION"])[2]; 534 535 return _codonCompResult; 536} 537 538 539 540/*----------------------------------------------------------------------------------------------------------*/ 541 542function defineCodonToAA() { 543 return defineCodonToAAGivenCode(_Genetic_Code); 544} 545 546/*----------------------------------------------------------------------------------------------------------*/ 547 548function defineCodonToAAGivenCode(code) { 549 codonToAAMap = {}; 550 nucChars = "ACGT"; 551 552 for (p1 = 0; p1 < 64; p1 += 1) { 553 codonToAAMap[nucChars[p1$16] + nucChars[p1 % 16 $4] + nucChars[p1 % 4]] = _hyphyAAOrdering[code[p1]]; 554 } 555 556 return codonToAAMap; 557} 558 559/*----------------------------------------------------------------------------------------------------------*/ 560 561function findAllCodonsForAA(aa) { 562 codonsForAA = {}; 563 564 for (p1 = 0; p1 < 64; p1 = p1 + 1) { 565 if (_hyphyAAOrdering[_Genetic_Code[p1]] == aa) { 566 codonsForAA[p1] = 1; 567 } 568 } 569 570 return codonsForAA; 571} 572 573/*----------------------------------------------------------------------------------------------------------*/ 574 575function RawToSense(code) 576/* 577 given: 578 genetic code, 579 580 returns a 64x1 matrix mapping raw codons to sense codons only (stops are mapped to -1) 581*/ 582{ 583 _codonMap = { 584 64, 1 585 }; 586 587 _cShift = 0; 588 for (_ci = 0; _ci < 64; _ci = _ci + 1) { 589 if (code[_ci] == genetic_code.stop_code) { 590 _cShift = _cShift + 1; 591 _codonMap[_ci] = -1; 592 } else { 593 _codonMap[_ci] = _ci - _cShift; 594 } 595 } 596 597 return _codonMap; 598} 599 600 601/*----------------------------------------------------------------------------------------------------------*/ 602 603function IsTransition(pair) 604/* 605 given: 606 a pair of nucleotides (as a 1x2 matrix, e.g. as returned by CompareCodonProperties["1"]), 607 608 returns 1 if the substitution is a transition 609 returns -1 if the substitution is a transversion 610 611 RETURNS 0 IF NO SUBSTITUTION TOOK PLACE 612*/ 613{ 614 if (pair[0] != pair[1]) { 615 if (Abs(pair[0] - pair[1]) % 2 == 0) { 616 return 1; 617 } 618 return -1; 619 } 620 return 0; 621} 622 623/*----------------------------------------------------------------------------------------------------------*/ 624 625lfunction IsStop(codon, code) 626 627/* 628 given: 629 codon (a number between 0 and 63 in AAA...TTT encoding) 630 code (the genetic code) 631 632 returns 633 whether or not the codon is a stop codon 634*/ 635 636{ 637 return code[codon] == ^ "genetic_code.stop_code"; 638} 639 640/*----------------------------------------------------------------------------------------------------------*/ 641 642function translateCodonToAA(codonSeq, mapping, offset) { 643 seqLen = Abs(codonSeq); 644 translString = ""; 645 translString * (seqLen / 3 + 1); 646 for (seqPos = offset; seqPos < seqLen; seqPos += 3) { 647 codon = codonSeq[seqPos][seqPos + 2]; 648 prot = mapping[codon]; 649 if (Abs(prot)) { 650 translString * prot; 651 } else { 652 if (codon == "---") { 653 translString * "-"; 654 } else { 655 translString * "?"; 656 } 657 } 658 } 659 translString * 0; 660 translString = translString ^ { 661 { 662 "X$", "?" 663 } 664 }; 665 666 return translString; 667} 668 669 670 671/*----------------------------------------------------------------------------------------------------------*/ 672 673lfunction ComputeCodonCodeToStringMap(genCode) { 674 _codonMap = {}; 675 _nucLetters = "ACGT"; 676 for (_idx = 0; _idx < Columns(genCode); _idx += 1) { 677 if (genCode[_idx] != ^ "genetic_code.stop_code") { 678 _codonMap + (_nucLetters[_idx$16] + _nucLetters[(_idx % 16) $4] + _nucLetters[_idx % 4]); 679 } 680 } 681 return _codonMap; 682} 683 684/*----------------------------------------------------------------------------------------------------------*/ 685 686lfunction ComputeCodonCodeToStringMapStop (genCode) { 687 _codonMap = {}; 688 _nucLetters = "ACGT"; 689 for (_idx = 0; _idx < Columns(genCode); _idx += 1) { 690 if (genCode[_idx] == ^ "genetic_code.stop_code") { 691 _codonMap + (_nucLetters[_idx$16] + _nucLetters[(_idx%16)$4] + _nucLetters[_idx%4]); 692 } 693 } 694 return _codonMap; 695} 696 697/*----------------------------------------------------------------------------------------------------------*/ 698 699lfunction genetic_code.partition_codon(codon) { 700 return Eval({ 701 { 702 codon$16, (codon % 16) $4, codon % 4 703 } 704 }); 705} 706 707lfunction genetic_code.assemble_codon(positions) { 708 return positions[0] * 16 + positions[1] * 4 + positions[2]; 709} 710 711/*----------------------------------------------------------------------------------------------------------*/ 712 713lfunction genetic_code.ComputeBranchLengthStencils(genCode) { 714 /* 715 given a genetic code (`genCode`), computes a matrix of N x N entries (N = sense codons) 716 where a value of 1 in cell (i,j) means that i <-> j is a substitution of a specific type 717 (e.g. synonymous or non-synonymous), and a value of 0 is assigned to all other cells. 718 719 returns a dictionary with 'synonymous' and 'non-synonymous' keys 720 721 Also see inline comments 722 723 */ 724 725 sense_codons = CountSenseCodons (genCode); 726 SS = {sense_codons, sense_codons}; 727 NS = {sense_codons, sense_codons}; 728 729 stop_code = ^ "genetic_code.stop_code"; 730 codon_offset = 0; 731 732 733 for (codon = 0; codon < 64; codon += 1) { 734 if (genCode[codon] == stop_code) { 735 codon_offset += 1; 736 } else { 737 aa1 = genCode [codon]; 738 codon_offset2 = codon_offset; 739 for (codon2 = codon + 1; codon2 < 64; codon2 += 1) { 740 if (genCode [codon2] == stop_code) { 741 codon_offset2 += 1; 742 } else { 743 if (aa1 == genCode [codon2]) { 744 SS [codon-codon_offset][codon2-codon_offset2] = 1; 745 } else { 746 NS [codon-codon_offset][codon2-codon_offset2] = 1; 747 } 748 } 749 } 750 } 751 } 752 753 matrix.Symmetrize(SS); 754 matrix.Symmetrize(NS); 755 756 return {"synonymous" : SS, "non-synonymous" : NS}; 757 758} 759 760/*----------------------------------------------------------------------------------------------------------*/ 761 762lfunction genetic_code.DefineCodonToAAMapping (code) { 763 codonToAAMap = {}; 764 nucChars = "ACGT"; 765 766 for (p = 0; p < 64; p += 1) { 767 codonToAAMap[nucChars[p$16] + nucChars[p % 16 $4] + nucChars[p % 4]] = (^"_hyphyAAOrdering")[code[p]]; 768 } 769 770 return codonToAAMap; 771} 772 773 774/*----------------------------------------------------------------------------------------------------------*/ 775 776lfunction genetic_code.DefineIntegerToAAMapping (code, only_sense) { 777 codon_code_map = {}; 778 779 shift = 0; 780 781 for (p = 0; p < 64; p += 1) { 782 if (IsStop (p, code)) { 783 if (only_sense) { 784 shift += 1; 785 continue; 786 } 787 } 788 789 codon_code_map[p-shift] = (^"_hyphyAAOrdering")[code[p]]; 790 } 791 792 return codon_code_map; 793} 794 795/*----------------------------------------------------------------------------------------------------------*/ 796 797lfunction genetic_code.ComputePairwiseDifferencesAndExpectedSites(genCode, options) { 798 /* 799 given a genetic code (`genCode`), computes a number of per-codon (or per pair of codons) 800 quantities that relate to numbers of synonymous and non-synonymous sites or 801 substitutions. 802 803 `options` can be null, or have any of the following keys: 804 805 `weighting-matrix` is expected to be a set of 3 4x4 matrices showing relative frequencies of 806 various nucleotide->nucleotide substitutions stratified by codon position; by default they 807 are all equal 808 809 `count-stop-codons` treat mutations to stop codons as non-synonymous changes for counting purposes 810 (by the default they are not counted at all) 811 812 Also see inline comments 813 814 */ 815 816 SS = { 817 64, 1 818 }; // raw codon index -> # of synonymous sites [0-3] 819 NS = SS; // raw codon index -> # of non-synonymous sites [0-3] 820 821 stop_code = ^ "genetic_code.stop_code"; 822 823 if (Type(options["weighting-matrix"]) == "AssociativeList") { 824 weighting_matrix = options["weighting-matrix"]; 825 } else { 826 equal = { 827 4, 4 828 }["1"]; 829 weighting_matrix = {}; 830 weighting_matrix + equal; 831 weighting_matrix + equal; 832 weighting_matrix + equal; 833 } 834 835 keep_stop_codons = FALSE; 836 837 if (Type(options["count-stop-codons"]) == "Number") { 838 keep_stop_codons = options["count-stop-codons"]; 839 } 840 841 codon_offset = 0; 842 843 for (codon = 0; codon < 64; codon += 1) { 844 845 if (genCode[codon] == stop_code) { 846 codon_offset += 1; 847 } else { 848 849 codon_info = genetic_code.partition_codon(codon); 850 aa = genCode[codon]; 851 852 for (codon_position = 0; codon_position < 3; codon_position += 1) { 853 norm_factor = 0.0; 854 sSites = 0.0; 855 nsSites = 0.0; 856 // mutagenize 'codon' at 'codon_position' 857 858 copy_codon = codon_info; 859 for (new_nuc = 0; new_nuc < 4; new_nuc += 1) { 860 if (new_nuc != codon_info[codon_position]) { 861 copy_codon[codon_position] = new_nuc; 862 new_codon = genetic_code.assemble_codon(copy_codon); 863 w = (weighting_matrix[codon_position])[codon_info[codon_position]][new_nuc]; 864 if (keep_stop_codons || stop_code == genCode[new_codon] == 0) { 865 if (genCode[new_codon] != aa) { 866 nsSites += w; 867 } else { 868 sSites += w; 869 } 870 } 871 norm_factor += w; 872 } 873 } 874 875 if (norm_factor > 0) { 876 SS[codon] += sSites / norm_factor; 877 NS[codon] += nsSites / norm_factor; 878 } 879 880 } 881 } 882 } 883 884 senseCodonCount = 64 - codon_offset; 885 886 EPS = { 887 senseCodonCount, senseCodonCount 888 }; 889 EPN = EPS; 890 OPS = EPS; 891 OPN = EPS; 892 NTP = EPS["-1"]; 893 894 empty_dict = {}; 895 896 codon_offset_1 = 0; 897 898 all_permutations = { 899 "0": { 900 { 901 0, 1, 2 902 } 903 }, 904 "1": { 905 { 906 0, 2, 1 907 } 908 }, 909 "2": { 910 { 911 1, 0, 2 912 } 913 }, 914 "3": { 915 { 916 1, 2, 0 917 } 918 }, 919 "4": { 920 { 921 2, 0, 1 922 } 923 }, 924 "5": { 925 { 926 2, 1, 0 927 } 928 } 929 }; 930 931 ntp_matrix = { 932 { 933 0, 0, 1, 2 934 } { 935 0, 0, 3, 4 936 } { 937 0, 0, 0, 5 938 } { 939 0, 0, 0, 0 940 } 941 }; 942 943 matrix.Symmetrize(ntp_matrix); 944 945 946 for (codon_1 = 0; codon_1 < 64; codon_1 += 1) { 947 if (genCode[codon_1] == stop_code) { 948 codon_offset_1 += 1; 949 continue; 950 } 951 952 codon_info_1 = genetic_code.partition_codon(codon_1); 953 aa_1 = genCode[codon_1]; 954 direct_index_1 = codon_1 - codon_offset_1; 955 956 EPS[direct_index_1][direct_index_1] = SS[codon_1]; 957 EPN[direct_index_1][direct_index_1] = NS[codon_1]; 958 959 codon_offset_2 = codon_offset_1; 960 961 962 for (codon_2 = codon_1 + 1; codon_2 < 64; codon_2 += 1) { 963 if (genCode[codon_2] == stop_code) { 964 codon_offset_2 += 1; 965 continue; 966 } 967 968 969 codon_info_2 = genetic_code.partition_codon(codon_2); 970 aa_2 = genCode[codon_2]; 971 direct_index_2 = codon_2 - codon_offset_2; 972 973 974 path_count = 0; 975 eps = 0; 976 epn = 0; 977 ops = 0; 978 opn = 0; 979 ntp = None; 980 981 for (path = 0; path < 6; path += 1) { 982 current_codon = codon_info_1; 983 current_aa = aa_1; 984 codon_sequence = empty_dict; 985 codon_sequence + codon_1; 986 987 ps = 0; 988 pn = 0; 989 990 for (path_step = 0; path_step < 3; path_step += 1) { 991 change_index = (all_permutations[path])[path_step]; 992 if (current_codon[change_index] != codon_info_2[change_index]) { 993 current_codon[change_index] = codon_info_2[change_index]; 994 current_codon_index = genetic_code.assemble_codon(current_codon); 995 next_aa = genCode[current_codon_index]; 996 if (next_aa == stop_code) { 997 break; 998 } 999 codon_sequence + current_codon_index; 1000 if (current_aa == next_aa) { 1001 ps += 1; 1002 } else { 1003 pn += 1; 1004 } 1005 current_aa = next_aa; 1006 } 1007 } 1008 1009 if (path_step == 3) { 1010 path_count += 1; 1011 path_length = Abs(codon_sequence); 1012 1013 if (path_length == 2 && ntp == None) { 1014 for (position = 0; position < 3; position += 1) { 1015 if (codon_info_1[position] != codon_info_2[position]) { 1016 ntp = ntp_matrix[codon_info_1[position]][codon_info_2[position]]; 1017 break; 1018 } 1019 } 1020 } 1021 1022 pes = 0; 1023 pns = 0; 1024 for (path_step = 0; path_step < path_length; path_step += 1) { 1025 pes += SS[codon_sequence[path_step]]; 1026 pns += NS[codon_sequence[path_step]]; 1027 } 1028 eps += pes / path_length; 1029 epn += pns / path_length; 1030 ops += ps; 1031 opn += pn; 1032 } 1033 } 1034 1035 if (path_count > 0) { 1036 EPS[direct_index_1][direct_index_2] = eps / path_count; 1037 EPN[direct_index_1][direct_index_2] = epn / path_count; 1038 OPS[direct_index_1][direct_index_2] = ops / path_count; 1039 OPN[direct_index_1][direct_index_2] = opn / path_count; 1040 if (None != ntp) { 1041 NTP[direct_index_1][direct_index_2] = ntp; 1042 } 1043 } 1044 1045 } 1046 } 1047 1048 matrix.Symmetrize(EPS); 1049 matrix.Symmetrize(EPN); 1050 matrix.Symmetrize(OPS); 1051 matrix.Symmetrize(OPN); 1052 matrix.Symmetrize(NTP); 1053 1054 SS_sense = {senseCodonCount, 1}; 1055 NS_sense = {senseCodonCount, 1}; 1056 1057 codon_offset_1 = 0; 1058 for (codon_1 = 0; codon_1 < 64; codon_1 += 1) { 1059 if (genCode[codon_1] == stop_code) { 1060 codon_offset_1 += 1; 1061 } 1062 SS_sense [codon_1 - codon_offset_1] = SS[codon_1]; 1063 NS_sense [codon_1 - codon_offset_1] = NS[codon_1]; 1064 } 1065 1066 return {"EPS" : EPS, "EPN": EPN, "OPS" : OPS, "OPN" : OPN, "NTP" : NTP, "SS" : SS_sense, "NS": NS_sense}; 1067} 1068