1 // 2 // C++ Interface: alignment 3 // 4 // Description: 5 // 6 // 7 // Author: BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui@univie.ac.at>, (C) 2008 8 // 9 // Copyright: See COPYING file that comes with this distribution 10 // 11 // 12 #ifndef ALIGNMENT_H 13 #define ALIGNMENT_H 14 15 #include <vector> 16 #include <bitset> 17 #include "pattern.h" 18 #include "ncl/ncl.h" 19 20 const double MIN_FREQUENCY = 0.0001; 21 const double MIN_FREQUENCY_DIFF = 0.00001; 22 23 const int NUM_CHAR = 256; 24 typedef bitset<NUM_CHAR> StateBitset; 25 26 /** class storing results of symmetry tests */ 27 class SymTestResult { 28 public: SymTestResult()29 SymTestResult() { 30 significant_pairs = included_pairs = excluded_pairs = 0; 31 pvalue_binom = -1.0; 32 } 33 34 /** compute pvalue using bionomial test */ 35 void computePvalue(); 36 37 int significant_pairs; // number of significant sequence pairs 38 int included_pairs; // total number of included sequence pairs 39 int excluded_pairs; // number of excluded sequence pairs 40 double max_stat; // maximum of the pair statistics 41 double pvalue_binom; // pvalue of binomial test of symmetry 42 double pvalue_maxdiv; // p-value of the sequence pair with maximum divergence 43 double pvalue_perm; // p-value of permutation test of symmetry 44 }; 45 46 /** class storing all pairwise statistics */ 47 class SymTestStat { 48 public: SymTestStat()49 SymTestStat() { 50 part = 0; 51 seq1 = seq2 = 0; 52 chi2_sym = 0.0; 53 chi2_marsym = std::numeric_limits<double>::quiet_NaN(); 54 chi2_intsym = std::numeric_limits<double>::quiet_NaN(); 55 pval_sym = std::numeric_limits<double>::quiet_NaN(); 56 pval_marsym = std::numeric_limits<double>::quiet_NaN(); 57 pval_intsym = std::numeric_limits<double>::quiet_NaN(); 58 } 59 int part; // partition ID 60 int seq1, seq2; // ID of sequence 1 and 2 61 double chi2_sym; // chi2 statistic test of symmetry 62 double chi2_marsym; // chi2 statistic test of marginal symmetry 63 double chi2_intsym; // chi2 statistic test of internal symmetry 64 double pval_sym; // chi2 p-value test of symmetry 65 double pval_marsym; // chi2 p-value test of marginal symmetry 66 double pval_intsym; // chi2 p-value test of internal symmetry 67 }; 68 69 std::ostream& operator<< (std::ostream& stream, const SymTestResult& res); 70 71 #ifdef USE_HASH_MAP 72 struct hashPattern { operatorhashPattern73 size_t operator()(const vector<StateType> &sp) const { 74 size_t sum = 0; 75 for (Pattern::const_iterator it = sp.begin(); it != sp.end(); it++) 76 sum = (*it) + (sum << 6) + (sum << 16) - sum; 77 return sum; 78 } 79 }; 80 typedef unordered_map<vector<StateType>, int, hashPattern> PatternIntMap; 81 #else 82 typedef map<vector<StateType>, int> PatternIntMap; 83 #endif 84 85 86 constexpr int EXCLUDE_GAP = 1; // exclude gaps 87 constexpr int EXCLUDE_INVAR = 2; // exclude invariant sites 88 constexpr int EXCLUDE_UNINF = 4; // exclude uninformative sites 89 90 /** 91 Multiple Sequence Alignment. Stored by a vector of site-patterns 92 93 @author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui@univie.ac.at> 94 */ 95 class Alignment : public vector<Pattern>, public CharSet, public StateSpace { 96 friend class SuperAlignment; 97 friend class SuperAlignmentUnlinked; 98 99 public: 100 101 /** 102 constructor 103 */ 104 Alignment(); 105 106 /** 107 constructor 108 @param filename file name 109 @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL 110 @param intype (OUT) input format of the file 111 */ 112 Alignment(char *filename, char *sequence_type, InputType &intype, string model); 113 114 /** 115 constructor 116 @param data_block nexus DATA block 117 @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL 118 */ 119 Alignment(NxsDataBlock *data_block, char *sequence_type, string model); 120 121 /** 122 destructor 123 */ 124 virtual ~Alignment(); 125 126 127 /**************************************************************************** 128 input alignment reader 129 ****************************************************************************/ 130 131 /** get the SeqType for a given string */ 132 static SeqType getSeqType(const char *sequence_type); 133 134 135 /** 136 add a pattern into the alignment 137 @param pat the pattern 138 @param site the site index of the pattern from the alignment 139 @param freq frequency of pattern 140 @return TRUE if pattern contains only gaps or unknown char. 141 In that case, the pattern won't be added. 142 */ 143 bool addPattern(Pattern &pat, int site, int freq = 1); 144 145 /** 146 determine if the pattern is constant. update the is_const variable. 147 */ 148 virtual void computeConst(Pattern &pat); 149 150 151 void printSiteInfoHeader(ostream& out, const char* filename, bool partition = false); 152 /** 153 Print all site information to a stream 154 @param out output stream 155 @param part_id partition ID, negative to omit 156 */ 157 void printSiteInfo(ostream &out, int part_id); 158 159 /** 160 Print all site information to a file 161 @param filename output file name 162 */ 163 virtual void printSiteInfo(const char* filename); 164 165 /** 166 * add const patterns into the alignment 167 * @param freq_const_pattern comma-separated list of const pattern frequencies 168 */ 169 void addConstPatterns(char *freq_const_patterns); 170 171 /** 172 read the alignment in NEXUS format 173 @param filename file name 174 @return 1 on success, 0 on failure 175 */ 176 int readNexus(char *filename); 177 178 int buildPattern(StrVector &sequences, char *sequence_type, int nseq, int nsite); 179 180 /** 181 read the alignment in PHYLIP format (interleaved) 182 @param filename file name 183 @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL 184 @return 1 on success, 0 on failure 185 */ 186 int readPhylip(char *filename, char *sequence_type); 187 188 /** 189 read the alignment in sequential PHYLIP format 190 @param filename file name 191 @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL 192 @return 1 on success, 0 on failure 193 */ 194 int readPhylipSequential(char *filename, char *sequence_type); 195 196 /** 197 read the alignment in FASTA format 198 @param filename file name 199 @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL 200 @return 1 on success, 0 on failure 201 */ 202 int readFasta(char *filename, char *sequence_type); 203 204 /** 205 * Read the alignment in counts format (PoMo). 206 * 207 * TODO: Allow noninformative sites (where no base is present). 208 * 209 * @param filename file name 210 * @param sequence_type sequence type (i.e., "CF10") 211 * 212 * @return 1 on success, 0 on failure 213 */ 214 int readCountsFormat(char *filename, char *sequence_type); 215 216 /** 217 read the alignment in CLUSTAL format 218 @param filename file name 219 @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL 220 @return 1 on success, 0 on failure 221 */ 222 int readClustal(char *filename, char *sequence_type); 223 224 /** 225 read the alignment in MSF format 226 @param filename file name 227 @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL 228 @return 1 on success, 0 on failure 229 */ 230 int readMSF(char *filename, char *sequence_type); 231 232 /** 233 extract the alignment from a nexus data block, called by readNexus() 234 @param data_block data block of nexus file 235 */ 236 void extractDataBlock(NxsCharactersBlock *data_block); 237 238 vector<Pattern> ordered_pattern; 239 240 /** lower bound of sum parsimony scores for remaining pattern in ordered_pattern */ 241 UINT *pars_lower_bound; 242 243 /** order pattern by number of character states and return in ptn_order 244 @param pat_type either PAT_INFORMATIVE or 0 245 */ 246 virtual void orderPatternByNumChars(int pat_type); 247 248 /** 249 * un-group site-patterns, i.e., making #sites = #patterns and pattern frequency = 1 for all patterns 250 */ 251 void ungroupSitePattern(); 252 253 254 /** 255 * re-group site-patterns 256 * @param groups number of groups 257 * @param site_group group ID (0, 1, ...ngroups-1; must be continuous) of all sites 258 */ 259 void regroupSitePattern(int groups, IntVector &site_group); 260 261 262 /**************************************************************************** 263 output alignment 264 ****************************************************************************/ 265 SeqType detectSequenceType(StrVector &sequences); 266 267 void computeUnknownState(); 268 269 void buildStateMap(char *map, SeqType seq_type); 270 271 virtual StateType convertState(char state, SeqType seq_type); 272 273 /** 274 * convert state if the number of states (num_states is known) 275 * @param state input char to convert 276 * @return output char from 0 to 0-num_states or STATE_INVALID or STATE_UNKNOWN 277 */ 278 StateType convertState(char state); 279 280 //virtual void convertStateStr(string &str, SeqType seq_type); 281 282 /** 283 * convert from internal state to user-readable state (e.g., to ACGT for DNA) 284 * Note: does not work for codon data 285 * @param state internal state code 286 * @return user-readable state 287 */ 288 char convertStateBack(char state); 289 290 /** 291 * convert from internal state to user-readable state (e.g., to ACGT for DNA) 292 * Note: work for all data 293 * @param state internal state code 294 * @return user-readable state string 295 */ 296 string convertStateBackStr(StateType state); 297 298 /** 299 get alignment site range from the residue range relative to a sequence 300 @param seq_id reference sequence 301 @param residue_left (IN/OUT) left of range 302 @param residue_right (IN/OUT) right of range [left,right) 303 @return TRUE if success, FALSE if out of range 304 */ 305 bool getSiteFromResidue(int seq_id, int &residue_left, int &residue_right); 306 307 int buildRetainingSites(const char *aln_site_list, IntVector &kept_sites, 308 int exclude_sites, const char *ref_seq_name); 309 310 void printAlignment(InputType format, const char *filename, bool append = false, const char *aln_site_list = NULL, 311 int exclude_sites = 0, const char *ref_seq_name = NULL); 312 313 virtual void printAlignment(InputType format, ostream &out, bool append = false, const char *aln_site_list = NULL, 314 int exclude_sites = 0, const char *ref_seq_name = NULL); 315 316 void printPhylip(ostream &out, bool append = false, const char *aln_site_list = NULL, 317 int exclude_sites = 0, const char *ref_seq_name = NULL, bool print_taxid = false); 318 319 void printFasta(ostream &out, bool append = false, const char *aln_site_list = NULL, 320 int exclude_sites = 0, const char *ref_seq_name = NULL); 321 322 void printNexus(ostream &out, bool append = false, const char *aln_site_list = NULL, 323 int exclude_sites = 0, const char *ref_seq_name = NULL, bool print_taxid = false); 324 /** 325 Print the number of gaps per site 326 @param filename output file name 327 */ 328 void printSiteGaps(const char *filename); 329 330 /**************************************************************************** 331 get general information from alignment 332 ****************************************************************************/ 333 334 /** 335 @return number of sequences 336 */ getNSeq()337 inline int getNSeq() { 338 return seq_names.size(); 339 } 340 341 /** 342 @return number of sites (alignment columns) 343 */ getNSite()344 inline int getNSite() { 345 return site_pattern.size(); 346 } 347 348 /** 349 @return number of patterns 350 */ getNPattern()351 inline int getNPattern() { 352 return size(); 353 } 354 getPatternID(int site)355 inline int getPatternID(int site) { 356 return site_pattern[site]; 357 } 358 getPattern(int site)359 inline Pattern getPattern(int site) { 360 return at(site_pattern[site]); 361 } 362 363 /** 364 * @param pattern_index (OUT) vector of size = alignment length storing pattern index of all sites 365 */ getSitePatternIndex(IntVector & pattern_index)366 virtual void getSitePatternIndex(IntVector &pattern_index) { 367 pattern_index = site_pattern; 368 } 369 370 /** 371 * @param freq (OUT) vector of site-pattern frequencies 372 */ 373 virtual void getPatternFreq(IntVector &freq); 374 375 /** 376 * @param[out] freq vector of site-pattern frequencies 377 */ 378 virtual void getPatternFreq(int *freq); 379 380 /** 381 @param i sequence index 382 @return sequence name 383 */ 384 string &getSeqName(int i); 385 386 /** 387 * Get a list of all sequence names 388 * @return vector containing the sequence names 389 */ 390 vector<string>& getSeqNames(); 391 392 /** 393 @param seq_name sequence name 394 @return corresponding ID, -1 if not found 395 */ 396 int getSeqID(string &seq_name); 397 398 /** 399 @return length of the longest sequence name 400 */ 401 int getMaxSeqNameLength(); 402 403 /* 404 check if some state is absent, which may cause numerical issues 405 @param msg additional message into the warning 406 @return number of absent states in the alignment 407 */ 408 virtual int checkAbsentStates(string msg); 409 410 /** 411 check proper and undupplicated sequence names 412 */ 413 void checkSeqName(); 414 415 /** 416 * check identical sequences 417 * @return the number of sequences that are identical to one of the sequences 418 */ 419 int checkIdenticalSeq(); 420 421 /** 422 * remove identical sequences from alignment 423 * @param not_remove name of sequence where removal is avoided 424 * @param keep_two TRUE to keep 2 out of k identical sequences, false to keep only 1 425 * @param removed_seqs (OUT) name of removed sequences 426 * @param target_seqs (OUT) corresponding name of kept sequence that is identical to the removed sequences 427 * @return this if no sequences were removed, or new alignment if at least 1 sequence was removed 428 */ 429 virtual Alignment *removeIdenticalSeq(string not_remove, bool keep_two, StrVector &removed_seqs, StrVector &target_seqs); 430 431 /** 432 Quit if some sequences contain only gaps or missing data 433 */ 434 virtual void checkGappySeq(bool force_error = true); 435 436 /** 437 * return a new alignment if some sequence is totally gappy, or this if all sequence are okey 438 */ 439 Alignment *removeGappySeq(); 440 441 /** 442 @return TRUE if seq_id contains only gaps or missing characters 443 @param seq_id sequence ID 444 */ 445 bool isGapOnlySeq(int seq_id); 446 isSuperAlignment()447 virtual bool isSuperAlignment() { 448 return false; 449 } 450 451 /**************************************************************************** 452 alignment general processing 453 ****************************************************************************/ 454 455 /** 456 extract sub-alignment of a sub-set of sequences 457 @param aln original input alignment 458 @param seq_id ID of sequences to extract from 459 @param min_true_cher the minimum number of non-gap characters, true_char<min_true_char -> delete the sequence 460 @param min_taxa only keep alignment that has >= min_taxa sequences 461 @param[out] kept_partitions (for SuperAlignment) indices of kept partitions 462 */ 463 virtual void extractSubAlignment(Alignment *aln, IntVector &seq_id, int min_true_char, int min_taxa = 0, IntVector *kept_partitions = NULL); 464 465 /** 466 extract a sub-set of patterns 467 @param aln original input alignment 468 @param ptn_id ID of patterns to extract from 469 */ 470 void extractPatterns(Alignment *aln, IntVector &ptn_id); 471 472 /** 473 extract a sub-set of patterns 474 @param aln original input alignment 475 @param ptn_freq pattern frequency to extract from 476 */ 477 void extractPatternFreqs(Alignment *aln, IntVector &ptn_freq); 478 479 /** 480 create a non-parametric bootstrap alignment from an input alignment 481 @param aln input alignment 482 @param pattern_freq (OUT) resampled pattern frequencies if not NULL 483 @param spec bootstrap specification of the form "l1:b1,l2:b2,...,lk:bk" 484 to randomly draw b1 sites from the first l1 sites, etc. Note that l1+l2+...+lk 485 must equal m, where m is the alignment length. Otherwise, an error will occur. 486 If spec == NULL, a standard procedure is applied, i.e., randomly draw m sites. 487 */ 488 virtual void createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq = NULL, const char *spec = NULL); 489 490 /** 491 resampling pattern frequency by a non-parametric bootstrap 492 @param pattern_freq (OUT) resampled pattern frequencies 493 @param spec bootstrap specification, see above 494 */ 495 virtual void createBootstrapAlignment(IntVector &pattern_freq, const char *spec = NULL); 496 497 /** 498 resampling pattern frequency by a non-parametric bootstrap 499 @param pattern_freq (OUT) resampled pattern frequencies 500 @param spec bootstrap specification, see above 501 @param rstream random generator stream, NULL to use the global randstream 502 */ 503 virtual void createBootstrapAlignment(int *pattern_freq, const char *spec = NULL, int *rstream = NULL); 504 505 /** 506 Diep: This is for UFBoot2-Corr 507 Initialize "this" alignment as a bootstrap alignment 508 @param aln: the reference to the original alignment 509 @new_pattern_freqs: the frequencies of patterns to be present in bootstrap aln 510 */ 511 void buildFromPatternFreq(Alignment & aln, IntVector new_pattern_freqs); 512 513 /** 514 create a gap masked alignment from an input alignment. Gap patterns of masked_aln 515 will be superimposed into aln to create the current alignment object. 516 @param aln input alignment 517 @param masked_aln gappy alignment of the same size with aln 518 */ 519 void createGapMaskedAlignment(Alignment *masked_aln, Alignment *aln); 520 521 /** 522 * shuffle alignment by randomizing the order of sites 523 */ 524 virtual void shuffleAlignment(); 525 526 /** 527 concatenate an alignment into the current alignment object 528 @param aln an alignment of the same number of sequences and sequence names 529 */ 530 void concatenateAlignment(Alignment *aln); 531 532 /** 533 copy the input alignment into the current alignment object 534 @param aln input alignment 535 */ 536 void copyAlignment(Alignment *aln); 537 538 /** 539 extract a sub-set of sites 540 @param aln original input alignment 541 @param ptn_id ID of sites to extract from (starting from 0) 542 */ 543 void extractSites(Alignment *aln, IntVector &site_id); 544 545 /** 546 extract a sub-set of sites 547 @param aln original input alignment 548 @param spec specification of positions, e.g. "1-100,101-200\2" 549 */ 550 void extractSites(Alignment *aln, const char* spec); 551 552 /** 553 convert a DNA alignment into codon or AA alignment 554 */ 555 void convertToCodonOrAA(Alignment *aln, char *gene_code_id, bool nt2aa = false); 556 557 /** 558 convert this codon alignment to AA 559 */ 560 Alignment *convertCodonToAA(); 561 562 /** 563 convert this codon alignment to DNA 564 */ 565 Alignment *convertCodonToDNA(); 566 567 /** 568 @param quartet ID of four taxa 569 @param[out] support number of sites supporting 12|34, 13|24 and 14|23 570 */ 571 virtual void computeQuartetSupports(IntVector &quartet, vector<int64_t> &support); 572 573 /**************************************************************************** 574 Distance functions 575 ****************************************************************************/ 576 577 578 /** 579 compute the observed distance (number of different pairs of positions per site) 580 between two sequences 581 @param seq1 index of sequence 1 582 @param seq2 index of sequence 2 583 @return the observed distance between seq1 and seq2 (between 0.0 and 1.0) 584 */ 585 virtual double computeObsDist(int seq1, int seq2); 586 587 /** 588 @param seq1 index of sequence 1 589 @param seq2 index of sequence 2 590 @return Juke-Cantor correction distance between seq1 and seq2 591 */ 592 double computeJCDist(int seq1, int seq2); 593 594 /** 595 abstract function to compute the distance between 2 sequences. The default return 596 Juke-Cantor corrected distance. 597 @param seq1 index of sequence 1 598 @param seq2 index of sequence 2 599 @return any distance between seq1 and seq2 600 */ computeDist(int seq1,int seq2)601 virtual double computeDist(int seq1, int seq2) { 602 return computeJCDist(seq1, seq2); 603 } 604 605 606 /** 607 write distance matrix into a file in PHYLIP distance format 608 @param file_name distance file name 609 @param dist_mat distance matrix 610 */ 611 void printDist(const char *file_name, double *dist_mat); 612 613 /** 614 write distance matrix into a stream in PHYLIP distance format 615 @param out output stream 616 @param dist_mat distance matrix 617 */ 618 void printDist(ostream &out, double *dist_mat); 619 620 /** 621 read distance matrix from a file in PHYLIP distance format 622 @param file_name distance file name 623 @param dist_mat distance matrix 624 @return the longest distance 625 */ 626 double readDist(const char *file_name, double *dist_mat); 627 628 /** 629 read distance matrix from a stream in PHYLIP distance format 630 @param in input stream 631 @param dist_mat distance matrix 632 */ 633 double readDist(istream &in, double *dist_mat); 634 635 636 /**************************************************************************** 637 some statistics 638 ****************************************************************************/ 639 640 /** 641 count occurrences for each state from 0 to STATE_UNKNOWN 642 @param[out] state_count counts for all states 643 @param num_unknown_states number of unknown states e.g. for missing data 644 */ 645 void countStates(size_t *state_count, size_t num_unknown_states); 646 647 /** 648 convert counts to frequencies using EM algorithm 649 @param[in] state_count counts for all states 650 @paramp[out] state_freq normalized state frequency vector 651 */ 652 void convertCountToFreq(size_t *state_count, double *state_freq); 653 654 /** 655 compute empirical state frequencies from the alignment 656 @param state_freq (OUT) is filled with state frequencies, assuming state_freq was allocated with 657 at least num_states entries. 658 @param num_unknown_states number of unknown states e.g. for missing data 659 */ 660 virtual void computeStateFreq(double *state_freq, size_t num_unknown_states = 0); 661 662 int convertPomoState(int state); 663 664 /** 665 * Compute the absolute frequencies of the different states. 666 * Helpful for models with many states (e.g., PoMo) to check the 667 * abundancy of states in the data. 668 * 669 * @param abs_state_freq (OUT) assumed to be at least of size 670 * num_states. 671 */ 672 void computeAbsoluteStateFreq(unsigned int *abs_state_freq); 673 674 /** 675 compute empirical state frequencies for each sequence 676 @param freq_per_sequence (OUT) state frequencies for each sequence, of size num_states*num_freq 677 */ 678 void computeStateFreqPerSequence (double *freq_per_sequence); 679 680 void countStatePerSequence (unsigned *count_per_sequence); 681 682 /** 683 * Make all frequencies a little different and non-zero 684 * @param stateFrqArr (IN/OUT) state frequencies 685 */ 686 void convfreq(double *stateFrqArr); 687 688 /** 689 * compute special empirical frequencies for codon alignment: 1x4, 3x4, 3x4C 690 * @param state_freq (OUT) is filled with state frequencies, assuming state_freq was allocated with 691 * at least num_states entries. 692 * @param freq either FREQ_CODON_1x4, FREQ_CODON_3x4, or FREQ_CODON_3x4C 693 * @param ntfreq (OUT) nucleotide frequencies, assuming of size 4 for F1x4 and of size 12 for F3x4. 694 */ 695 void computeCodonFreq(StateFreqType freq, double *state_freq, double *ntfreq); 696 697 /** 698 compute empirical substitution counts between state pairs 699 @param normalize true to normalize row sum to 1, false otherwise 700 @param[out] pair_freq matrix of size num_states*num_states 701 @param[out] state_freq vector of size num_states 702 */ 703 virtual void computeDivergenceMatrix(double *pair_freq, double *state_freq, bool normalize = true); 704 705 /** 706 perform matched-pair tests of symmetry of Lars Jermiin et al. 707 @param[out] sym results of test of symmetry 708 @param[out] marsym results of test of marginal symmetry 709 @param[out] intsym results of test of internal symmetry 710 @param out output stream to print results 711 @param rstream random stream to shuffle alignment columns 712 @param out_stat output stream to print pairwise statistics 713 */ 714 virtual void doSymTest(size_t vecid, vector<SymTestResult> &sym, vector<SymTestResult> &marsym, 715 vector<SymTestResult> &intsym, int *rstream = NULL, vector<SymTestStat> *stats = NULL); 716 717 /** 718 count the fraction of constant sites in the alignment, update the variable frac_const_sites 719 */ 720 virtual void countConstSite(); 721 722 /** 723 * generate uninformative patterns 724 */ 725 void generateUninfPatterns(StateType repeat, vector<StateType> &singleton, vector<int> &seq_pos, vector<Pattern> &unobserved_ptns); 726 727 /** 728 * @param missing_data TRUE for missing data aware correction (for Mark Holder) 729 * @param[out] unobserved_ptns unobserved constant patterns, each entry encoding for one constant character 730 */ 731 void getUnobservedConstPatterns(ASCType ASC_type, vector<Pattern> &unobserved_ptns); 732 733 /** 734 @return the number of ungappy and unambiguous characters from a sequence 735 @param seq_id sequence ID 736 */ 737 int countProperChar(int seq_id); 738 739 /** 740 @return unconstrained log-likelihood (without a tree) 741 */ 742 virtual double computeUnconstrainedLogL(); 743 744 /** 745 * @return number of states, if it is a partition model, return max num_states across all partitions 746 */ getMaxNumStates()747 virtual int getMaxNumStates() { return num_states; } 748 749 /** either SEQ_BINARY, SEQ_DNA, SEQ_PROTEIN, SEQ_MORPH, or SEQ_CODON */ 750 SeqType seq_type; 751 752 StateType STATE_UNKNOWN; 753 754 /** 755 fraction of constant sites 756 */ 757 double frac_const_sites; 758 759 /** 760 fraction of invariant sites, incl. const sites and site like G-S-GG-GGGG 761 */ 762 double frac_invariant_sites; 763 764 /** number of parsimony informative sites */ 765 int num_informative_sites; 766 767 /** number of variant sites */ 768 int num_variant_sites; 769 770 /** number of sites used for parsimony computation, can be informative or variant */ 771 int num_parsimony_sites; 772 773 /** 774 * map from 64 codon to non-stop codon index 775 */ 776 char *non_stop_codon; 777 778 /** 779 * For codon sequences: index of 61 non-stop codons to 64 codons 780 * For other sequences: NULL 781 */ 782 char *codon_table; 783 784 /** 785 * For codon_sequences: 64 amino-acid letters for genetic code of AAA,AAC,AAG,AAT,...,TTT 786 * For other sequences: NULL 787 */ 788 char *genetic_code; 789 790 /** 791 * Virtual population size for PoMo model 792 */ 793 int virtual_pop_size; 794 795 // TODO DS: Maybe change default to SAMPLING_WEIGHTED_HYPER. 796 /// The sampling method (defaults to SAMPLING_WEIGHTED_BINOM). 797 SamplingType pomo_sampling_method; 798 799 /** BQM: 2015-07-06, 800 for PoMo data: map from state ID to pair of base1 and base2 801 represented in the high 16-bit and the low 16-bit of uint32_t 802 for base1, bit0-1 is used to encode the base (A,G,C,T) and the remaining 14 bits store the count 803 same interpretation for base2 804 */ 805 vector<uint32_t> pomo_sampled_states; 806 IntIntMap pomo_sampled_states_index; // indexing, to quickly find if a PoMo-2-state is already present 807 808 /* for site-specific state frequency model with Huaichun, Edward, Andrew */ 809 810 /* site to model ID map */ 811 IntVector site_model; 812 813 /** site to state frequency vector */ 814 vector<double*> site_state_freq; 815 816 /** 817 * @return true if data type is SEQ_CODON and state is a stop codon 818 */ 819 bool isStopCodon(int state); 820 821 bool isStandardGeneticCode(); 822 823 /** 824 * @return number of non-stop codons in the genetic code 825 */ 826 int getNumNonstopCodons(); 827 828 /* build seq_states containing set of states per sequence 829 * @param add_unobs_const TRUE to add all unobserved constant states (for +ASC model) 830 */ 831 //virtual void buildSeqStates(vector<vector<int> > &seq_states, bool add_unobs_const = false); 832 833 /** Added by MA 834 Compute the probability of this alignment according to the multinomial distribution with parameters determined by the reference alignment 835 @param refAlign the reference alignment 836 @param prob (OUT) the returned probabilty 837 838 The probability is computed as follows: 839 - From the reference alignment, we count the relative pattern frequencies p_1 ... p_k (sum = 1) 840 - From THIS alignment, we have frequencies d_1 ... d_k (sum = len = nsite) 841 - Prob(THIS | refAlign) = nsite!/(d_1! * ... * d_k!) product(p_i^d_i) 842 */ 843 void multinomialProb(Alignment refAlign, double &prob); 844 845 /** Added by MA 846 Compute the probability of the `expected alignment' according to the multinomial distribution with parameters determined by the pattern's observed frequencies in THIS alignment. 847 The `expected alignment' consists of patterns with log-likelihoods (under some model+tree) given in the input file (logLL). 848 Note that order of the log-likelihoods in inputLL must corresponds to patterns in THIS alignment. 849 850 @param inputLL the input patterns log-likelihood vector 851 @param prob (OUT) the returned probability 852 */ 853 void multinomialProb(DoubleVector logLL, double &prob); 854 void multinomialProb(double *logLL, double &prob); 855 856 /** Adapted from MA 857 compute the probability of the alignment defined by pattern_freq given this alignment 858 */ 859 double multinomialProb(IntVector &pattern_freq); 860 861 862 /** 863 get the appearance for a state, helpful for ambigious states 864 865 For nucleotides, the appearances of A, and C are 1000 and 0100, 866 respectively. If a state is ambiguous, more than one 1 will show up. 867 The appearance of the unknown state is 1111. 868 869 @param state the state index 870 @param state_app (OUT) state appearance 871 */ 872 void getAppearance(StateType state, double *state_app); 873 874 void getAppearance(StateType state, StateBitset &state_app); 875 876 /** 877 * read site specific state frequency vectors from a file to create corresponding model 878 * update site_model and site_state_freq variables for this class 879 * @param aln input alignment 880 * @param site_freq_file file name 881 * @return TRUE if alignment needs to be changed, FALSE otherwise 882 */ 883 bool readSiteStateFreq(const char* site_freq_file); 884 885 886 protected: 887 888 889 /** 890 sequence names 891 */ 892 vector<string> seq_names; 893 894 /** 895 Site to pattern index 896 */ 897 IntVector site_pattern; 898 899 /** 900 hash map from pattern to index in the vector of patterns (the alignment) 901 */ 902 PatternIntMap pattern_index; 903 904 905 /** 906 * special initialization for codon sequences, e.g., setting #states, genetic_code 907 * @param sequence_type user-defined sequence type 908 */ 909 void initCodon(char *gene_code_id); 910 911 }; 912 913 914 void extractSiteID(Alignment *aln, const char* spec, IntVector &site_id); 915 916 /** 917 create a new Alignment object with possibility of comma-separated file names 918 @param aln_file alignment file name, can be a comma-separated list of file names 919 @param sequence_type sequence data type 920 @param input input file format 921 @param model_name model name 922 */ 923 Alignment *createAlignment(string aln_file, const char *sequence_type, InputType intype, string model_name); 924 925 #endif 926