1 //
2 // C++ Interface: alignment
3 //
4 // Description:
5 //
6 //
7 // Author: BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui@univie.ac.at>, (C) 2008
8 //
9 // Copyright: See COPYING file that comes with this distribution
10 //
11 //
12 #ifndef ALIGNMENT_H
13 #define ALIGNMENT_H
14 
15 #include <vector>
16 #include <bitset>
17 #include "pattern.h"
18 #include "ncl/ncl.h"
19 
20 const double MIN_FREQUENCY          = 0.0001;
21 const double MIN_FREQUENCY_DIFF     = 0.00001;
22 
23 const int NUM_CHAR = 256;
24 typedef bitset<NUM_CHAR> StateBitset;
25 
26 /** class storing results of symmetry tests */
27 class SymTestResult {
28 public:
SymTestResult()29     SymTestResult() {
30         significant_pairs = included_pairs = excluded_pairs = 0;
31         pvalue_binom = -1.0;
32     }
33 
34     /** compute pvalue using bionomial test */
35     void computePvalue();
36 
37     int significant_pairs; // number of significant sequence pairs
38     int included_pairs; // total number of included sequence pairs
39     int excluded_pairs; // number of excluded sequence pairs
40     double max_stat; // maximum of the pair statistics
41     double pvalue_binom; // pvalue of binomial test of symmetry
42     double pvalue_maxdiv; // p-value of the sequence pair with maximum divergence
43     double pvalue_perm; // p-value of permutation test of symmetry
44 };
45 
46 /** class storing all pairwise statistics */
47 class SymTestStat {
48 public:
SymTestStat()49     SymTestStat() {
50         part = 0;
51         seq1 = seq2 = 0;
52         chi2_sym = 0.0;
53         chi2_marsym = std::numeric_limits<double>::quiet_NaN();
54         chi2_intsym = std::numeric_limits<double>::quiet_NaN();
55         pval_sym = std::numeric_limits<double>::quiet_NaN();
56         pval_marsym = std::numeric_limits<double>::quiet_NaN();
57         pval_intsym = std::numeric_limits<double>::quiet_NaN();
58     }
59     int part; // partition ID
60     int seq1, seq2; // ID of sequence 1 and 2
61     double chi2_sym; // chi2 statistic test of symmetry
62     double chi2_marsym; // chi2 statistic test of marginal symmetry
63     double chi2_intsym; // chi2 statistic test of internal symmetry
64     double pval_sym; // chi2 p-value test of symmetry
65     double pval_marsym; // chi2 p-value test of marginal symmetry
66     double pval_intsym; // chi2 p-value test of internal symmetry
67 };
68 
69 std::ostream& operator<< (std::ostream& stream, const SymTestResult& res);
70 
71 #ifdef USE_HASH_MAP
72 struct hashPattern {
operatorhashPattern73     size_t operator()(const vector<StateType> &sp) const {
74         size_t sum = 0;
75         for (Pattern::const_iterator it = sp.begin(); it != sp.end(); it++)
76             sum = (*it) + (sum << 6) + (sum << 16) - sum;
77         return sum;
78     }
79 };
80 typedef unordered_map<vector<StateType>, int, hashPattern> PatternIntMap;
81 #else
82 typedef map<vector<StateType>, int> PatternIntMap;
83 #endif
84 
85 
86 constexpr int EXCLUDE_GAP   = 1; // exclude gaps
87 constexpr int EXCLUDE_INVAR = 2; // exclude invariant sites
88 constexpr int EXCLUDE_UNINF = 4; // exclude uninformative sites
89 
90 /**
91 Multiple Sequence Alignment. Stored by a vector of site-patterns
92 
93         @author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler <minh.bui@univie.ac.at>
94  */
95 class Alignment : public vector<Pattern>, public CharSet, public StateSpace {
96     friend class SuperAlignment;
97     friend class SuperAlignmentUnlinked;
98 
99 public:
100 
101     /**
102             constructor
103      */
104     Alignment();
105 
106     /**
107             constructor
108             @param filename file name
109             @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL
110             @param intype (OUT) input format of the file
111      */
112     Alignment(char *filename, char *sequence_type, InputType &intype, string model);
113 
114     /**
115      constructor
116      @param data_block nexus DATA block
117      @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL
118      */
119     Alignment(NxsDataBlock *data_block, char *sequence_type, string model);
120 
121     /**
122             destructor
123      */
124     virtual ~Alignment();
125 
126 
127     /****************************************************************************
128             input alignment reader
129      ****************************************************************************/
130 
131     /** get the SeqType for a given string */
132     static SeqType getSeqType(const char *sequence_type);
133 
134 
135     /**
136             add a pattern into the alignment
137             @param pat the pattern
138             @param site the site index of the pattern from the alignment
139             @param freq frequency of pattern
140             @return TRUE if pattern contains only gaps or unknown char.
141                             In that case, the pattern won't be added.
142      */
143     bool addPattern(Pattern &pat, int site, int freq = 1);
144 
145 	/**
146 		determine if the pattern is constant. update the is_const variable.
147 	*/
148 	virtual void computeConst(Pattern &pat);
149 
150 
151     void printSiteInfoHeader(ostream& out, const char* filename, bool partition = false);
152     /**
153         Print all site information to a stream
154         @param out output stream
155         @param part_id partition ID, negative to omit
156     */
157     void printSiteInfo(ostream &out, int part_id);
158 
159     /**
160         Print all site information to a file
161         @param filename output file name
162     */
163     virtual void printSiteInfo(const char* filename);
164 
165     /**
166      * add const patterns into the alignment
167      * @param freq_const_pattern comma-separated list of const pattern frequencies
168      */
169     void addConstPatterns(char *freq_const_patterns);
170 
171     /**
172             read the alignment in NEXUS format
173             @param filename file name
174             @return 1 on success, 0 on failure
175      */
176     int readNexus(char *filename);
177 
178     int buildPattern(StrVector &sequences, char *sequence_type, int nseq, int nsite);
179 
180     /**
181             read the alignment in PHYLIP format (interleaved)
182             @param filename file name
183             @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL
184             @return 1 on success, 0 on failure
185      */
186     int readPhylip(char *filename, char *sequence_type);
187 
188     /**
189             read the alignment in sequential PHYLIP format
190             @param filename file name
191             @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL
192             @return 1 on success, 0 on failure
193      */
194     int readPhylipSequential(char *filename, char *sequence_type);
195 
196     /**
197             read the alignment in FASTA format
198             @param filename file name
199             @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL
200             @return 1 on success, 0 on failure
201      */
202     int readFasta(char *filename, char *sequence_type);
203 
204     /**
205      * Read the alignment in counts format (PoMo).
206      *
207      * TODO: Allow noninformative sites (where no base is present).
208      *
209      * @param filename file name
210      * @param sequence_type sequence type (i.e., "CF10")
211      *
212      * @return 1 on success, 0 on failure
213      */
214     int readCountsFormat(char *filename, char *sequence_type);
215 
216     /**
217             read the alignment in CLUSTAL format
218             @param filename file name
219             @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL
220             @return 1 on success, 0 on failure
221      */
222     int readClustal(char *filename, char *sequence_type);
223 
224     /**
225             read the alignment in MSF format
226             @param filename file name
227             @param sequence_type type of the sequence, either "BIN", "DNA", "AA", or NULL
228             @return 1 on success, 0 on failure
229      */
230     int readMSF(char *filename, char *sequence_type);
231 
232     /**
233             extract the alignment from a nexus data block, called by readNexus()
234             @param data_block data block of nexus file
235      */
236     void extractDataBlock(NxsCharactersBlock *data_block);
237 
238     vector<Pattern> ordered_pattern;
239 
240     /** lower bound of sum parsimony scores for remaining pattern in ordered_pattern */
241     UINT *pars_lower_bound;
242 
243     /** order pattern by number of character states and return in ptn_order
244         @param pat_type either PAT_INFORMATIVE or 0
245     */
246     virtual void orderPatternByNumChars(int pat_type);
247 
248     /**
249      * un-group site-patterns, i.e., making #sites = #patterns and pattern frequency = 1 for all patterns
250      */
251     void ungroupSitePattern();
252 
253 
254     /**
255      * re-group site-patterns
256      * @param groups number of groups
257      * @param site_group group ID (0, 1, ...ngroups-1; must be continuous) of all sites
258      */
259     void regroupSitePattern(int groups, IntVector &site_group);
260 
261 
262     /****************************************************************************
263             output alignment
264      ****************************************************************************/
265     SeqType detectSequenceType(StrVector &sequences);
266 
267     void computeUnknownState();
268 
269     void buildStateMap(char *map, SeqType seq_type);
270 
271     virtual StateType convertState(char state, SeqType seq_type);
272 
273     /**
274      * convert state if the number of states (num_states is known)
275      * @param state input char to convert
276      * @return output char from 0 to 0-num_states or STATE_INVALID or STATE_UNKNOWN
277      */
278     StateType convertState(char state);
279 
280     //virtual void convertStateStr(string &str, SeqType seq_type);
281 
282 	/**
283 	 * convert from internal state to user-readable state (e.g., to ACGT for DNA)
284 	 * Note: does not work for codon data
285 	 * @param state internal state code
286 	 * @return user-readable state
287 	 */
288     char convertStateBack(char state);
289 
290     /**
291 	 * convert from internal state to user-readable state (e.g., to ACGT for DNA)
292 	 * Note: work for all data
293 	 * @param state internal state code
294 	 * @return user-readable state string
295 	 */
296 	string convertStateBackStr(StateType state);
297 
298 	/**
299             get alignment site range from the residue range relative to a sequence
300             @param seq_id reference sequence
301             @param residue_left (IN/OUT) left of range
302             @param residue_right (IN/OUT) right of range [left,right)
303             @return TRUE if success, FALSE if out of range
304      */
305     bool getSiteFromResidue(int seq_id, int &residue_left, int &residue_right);
306 
307     int buildRetainingSites(const char *aln_site_list, IntVector &kept_sites,
308             int exclude_sites, const char *ref_seq_name);
309 
310     void printAlignment(InputType format, const char *filename, bool append = false, const char *aln_site_list = NULL,
311     		int exclude_sites = 0, const char *ref_seq_name = NULL);
312 
313     virtual void printAlignment(InputType format, ostream &out, bool append = false, const char *aln_site_list = NULL,
314                         int exclude_sites = 0, const char *ref_seq_name = NULL);
315 
316     void printPhylip(ostream &out, bool append = false, const char *aln_site_list = NULL,
317     		int exclude_sites = 0, const char *ref_seq_name = NULL, bool print_taxid = false);
318 
319     void printFasta(ostream &out, bool append = false, const char *aln_site_list = NULL,
320     		int exclude_sites = 0, const char *ref_seq_name = NULL);
321 
322     void printNexus(ostream &out, bool append = false, const char *aln_site_list = NULL,
323                     int exclude_sites = 0, const char *ref_seq_name = NULL, bool print_taxid = false);
324     /**
325             Print the number of gaps per site
326             @param filename output file name
327      */
328     void printSiteGaps(const char *filename);
329 
330     /****************************************************************************
331             get general information from alignment
332      ****************************************************************************/
333 
334     /**
335             @return number of sequences
336      */
getNSeq()337     inline int getNSeq() {
338         return seq_names.size();
339     }
340 
341     /**
342             @return number of sites (alignment columns)
343      */
getNSite()344     inline int getNSite() {
345         return site_pattern.size();
346     }
347 
348     /**
349              @return number of patterns
350      */
getNPattern()351     inline int getNPattern() {
352         return size();
353     }
354 
getPatternID(int site)355     inline int getPatternID(int site) {
356         return site_pattern[site];
357     }
358 
getPattern(int site)359     inline Pattern getPattern(int site) {
360         return at(site_pattern[site]);
361     }
362 
363     /**
364      * @param pattern_index (OUT) vector of size = alignment length storing pattern index of all sites
365      */
getSitePatternIndex(IntVector & pattern_index)366     virtual void getSitePatternIndex(IntVector &pattern_index) {
367         pattern_index = site_pattern;
368     }
369 
370     /**
371      * @param freq (OUT) vector of site-pattern frequencies
372      */
373     virtual void getPatternFreq(IntVector &freq);
374 
375     /**
376      * @param[out] freq vector of site-pattern frequencies
377      */
378     virtual void getPatternFreq(int *freq);
379 
380     /**
381             @param i sequence index
382             @return sequence name
383      */
384     string &getSeqName(int i);
385 
386     /**
387      *  Get a list of all sequence names
388      *  @return vector containing the sequence names
389      */
390     vector<string>& getSeqNames();
391 
392     /**
393             @param seq_name sequence name
394             @return corresponding ID, -1 if not found
395      */
396     int getSeqID(string &seq_name);
397 
398     /**
399             @return length of the longest sequence name
400      */
401     int getMaxSeqNameLength();
402 
403     /*
404         check if some state is absent, which may cause numerical issues
405         @param msg additional message into the warning
406         @return number of absent states in the alignment
407     */
408     virtual int checkAbsentStates(string msg);
409 
410     /**
411             check proper and undupplicated sequence names
412      */
413     void checkSeqName();
414 
415     /**
416      * check identical sequences
417      * @return the number of sequences that are identical to one of the sequences
418      */
419     int checkIdenticalSeq();
420 
421     /**
422      * remove identical sequences from alignment
423      * @param not_remove name of sequence where removal is avoided
424      * @param keep_two TRUE to keep 2 out of k identical sequences, false to keep only 1
425      * @param removed_seqs (OUT) name of removed sequences
426      * @param target_seqs (OUT) corresponding name of kept sequence that is identical to the removed sequences
427      * @return this if no sequences were removed, or new alignment if at least 1 sequence was removed
428      */
429     virtual Alignment *removeIdenticalSeq(string not_remove, bool keep_two, StrVector &removed_seqs, StrVector &target_seqs);
430 
431     /**
432             Quit if some sequences contain only gaps or missing data
433      */
434 	virtual void checkGappySeq(bool force_error = true);
435 
436 	/**
437 	 * return a new alignment if some sequence is totally gappy, or this if all sequence are okey
438 	 */
439 	Alignment *removeGappySeq();
440 
441     /**
442             @return TRUE if seq_id contains only gaps or missing characters
443             @param seq_id sequence ID
444      */
445     bool isGapOnlySeq(int seq_id);
446 
isSuperAlignment()447     virtual bool isSuperAlignment() {
448         return false;
449     }
450 
451     /****************************************************************************
452             alignment general processing
453      ****************************************************************************/
454 
455     /**
456             extract sub-alignment of a sub-set of sequences
457             @param aln original input alignment
458             @param seq_id ID of sequences to extract from
459             @param min_true_cher the minimum number of non-gap characters, true_char<min_true_char -> delete the sequence
460             @param min_taxa only keep alignment that has >= min_taxa sequences
461             @param[out] kept_partitions (for SuperAlignment) indices of kept partitions
462      */
463     virtual void extractSubAlignment(Alignment *aln, IntVector &seq_id, int min_true_char, int min_taxa = 0, IntVector *kept_partitions = NULL);
464 
465     /**
466             extract a sub-set of patterns
467             @param aln original input alignment
468             @param ptn_id ID of patterns to extract from
469      */
470     void extractPatterns(Alignment *aln, IntVector &ptn_id);
471 
472     /**
473             extract a sub-set of patterns
474             @param aln original input alignment
475             @param ptn_freq pattern frequency to extract from
476      */
477     void extractPatternFreqs(Alignment *aln, IntVector &ptn_freq);
478 
479     /**
480             create a non-parametric bootstrap alignment from an input alignment
481             @param aln input alignment
482             @param pattern_freq (OUT) resampled pattern frequencies if not NULL
483             @param spec bootstrap specification of the form "l1:b1,l2:b2,...,lk:bk"
484             	to randomly draw b1 sites from the first l1 sites, etc. Note that l1+l2+...+lk
485             	must equal m, where m is the alignment length. Otherwise, an error will occur.
486             	If spec == NULL, a standard procedure is applied, i.e., randomly draw m sites.
487      */
488     virtual void createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq = NULL, const char *spec = NULL);
489 
490     /**
491             resampling pattern frequency by a non-parametric bootstrap
492             @param pattern_freq (OUT) resampled pattern frequencies
493             @param spec bootstrap specification, see above
494      */
495     virtual void createBootstrapAlignment(IntVector &pattern_freq, const char *spec = NULL);
496 
497     /**
498             resampling pattern frequency by a non-parametric bootstrap
499             @param pattern_freq (OUT) resampled pattern frequencies
500             @param spec bootstrap specification, see above
501             @param rstream random generator stream, NULL to use the global randstream
502      */
503     virtual void createBootstrapAlignment(int *pattern_freq, const char *spec = NULL, int *rstream = NULL);
504 
505 	/**
506 			Diep: This is for UFBoot2-Corr
507 			Initialize "this" alignment as a bootstrap alignment
508 			@param aln: the reference to the original alignment
509 			@new_pattern_freqs: the frequencies of patterns to be present in bootstrap aln
510 	 */
511 	void buildFromPatternFreq(Alignment & aln, IntVector new_pattern_freqs);
512 
513     /**
514             create a gap masked alignment from an input alignment. Gap patterns of masked_aln
515                     will be superimposed into aln to create the current alignment object.
516             @param aln input alignment
517             @param masked_aln gappy alignment of the same size with aln
518      */
519     void createGapMaskedAlignment(Alignment *masked_aln, Alignment *aln);
520 
521     /**
522 	 * shuffle alignment by randomizing the order of sites
523 	 */
524 	virtual void shuffleAlignment();
525 
526 	/**
527             concatenate an alignment into the current alignment object
528             @param aln an alignment of the same number of sequences and sequence names
529      */
530     void concatenateAlignment(Alignment *aln);
531 
532     /**
533             copy the input alignment into the current alignment object
534             @param aln input alignment
535      */
536     void copyAlignment(Alignment *aln);
537 
538     /**
539             extract a sub-set of sites
540             @param aln original input alignment
541             @param ptn_id ID of sites to extract from (starting from 0)
542      */
543     void extractSites(Alignment *aln, IntVector &site_id);
544 
545     /**
546             extract a sub-set of sites
547             @param aln original input alignment
548             @param spec specification of positions, e.g. "1-100,101-200\2"
549      */
550     void extractSites(Alignment *aln, const char* spec);
551 
552     /**
553         convert a DNA alignment into codon or AA alignment
554     */
555     void convertToCodonOrAA(Alignment *aln, char *gene_code_id, bool nt2aa = false);
556 
557     /**
558      convert this codon alignment to AA
559      */
560     Alignment *convertCodonToAA();
561 
562     /**
563      convert this codon alignment to DNA
564      */
565     Alignment *convertCodonToDNA();
566 
567     /**
568      @param quartet ID of four taxa
569      @param[out] support number of sites supporting 12|34, 13|24 and 14|23
570      */
571     virtual void computeQuartetSupports(IntVector &quartet, vector<int64_t> &support);
572 
573     /****************************************************************************
574             Distance functions
575      ****************************************************************************/
576 
577 
578     /**
579             compute the observed distance (number of different pairs of positions per site)
580                     between two sequences
581             @param seq1 index of sequence 1
582             @param seq2 index of sequence 2
583             @return the observed distance between seq1 and seq2 (between 0.0 and 1.0)
584      */
585     virtual double computeObsDist(int seq1, int seq2);
586 
587     /**
588             @param seq1 index of sequence 1
589             @param seq2 index of sequence 2
590             @return Juke-Cantor correction distance between seq1 and seq2
591      */
592     double computeJCDist(int seq1, int seq2);
593 
594     /**
595             abstract function to compute the distance between 2 sequences. The default return
596             Juke-Cantor corrected distance.
597             @param seq1 index of sequence 1
598             @param seq2 index of sequence 2
599             @return any distance between seq1 and seq2
600      */
computeDist(int seq1,int seq2)601     virtual double computeDist(int seq1, int seq2) {
602         return computeJCDist(seq1, seq2);
603     }
604 
605 
606     /**
607             write distance matrix into a file in PHYLIP distance format
608             @param file_name distance file name
609             @param dist_mat distance matrix
610      */
611     void printDist(const char *file_name, double *dist_mat);
612 
613     /**
614             write distance matrix into a stream in PHYLIP distance format
615             @param out output stream
616             @param dist_mat distance matrix
617      */
618     void printDist(ostream &out, double *dist_mat);
619 
620     /**
621             read distance matrix from a file in PHYLIP distance format
622             @param file_name distance file name
623             @param dist_mat distance matrix
624             @return the longest distance
625      */
626     double readDist(const char *file_name, double *dist_mat);
627 
628     /**
629             read distance matrix from a stream in PHYLIP distance format
630             @param in input stream
631             @param dist_mat distance matrix
632      */
633     double readDist(istream &in, double *dist_mat);
634 
635 
636     /****************************************************************************
637             some statistics
638      ****************************************************************************/
639 
640     /**
641         count occurrences for each state from 0 to STATE_UNKNOWN
642         @param[out] state_count counts for all states
643         @param num_unknown_states number of unknown states e.g. for missing data
644      */
645     void countStates(size_t *state_count, size_t num_unknown_states);
646 
647     /**
648         convert counts to frequencies using EM algorithm
649         @param[in] state_count counts for all states
650         @paramp[out] state_freq normalized state frequency vector
651      */
652     void convertCountToFreq(size_t *state_count, double *state_freq);
653 
654     /**
655             compute empirical state frequencies from the alignment
656             @param state_freq (OUT) is filled with state frequencies, assuming state_freq was allocated with
657                     at least num_states entries.
658             @param num_unknown_states number of unknown states e.g. for missing data
659      */
660     virtual void computeStateFreq(double *state_freq, size_t num_unknown_states = 0);
661 
662     int convertPomoState(int state);
663 
664     /**
665      * Compute the absolute frequencies of the different states.
666      * Helpful for models with many states (e.g., PoMo) to check the
667      * abundancy of states in the data.
668      *
669      * @param abs_state_freq (OUT) assumed to be at least of size
670      * num_states.
671      */
672     void computeAbsoluteStateFreq(unsigned int *abs_state_freq);
673 
674     /**
675             compute empirical state frequencies for each sequence
676             @param freq_per_sequence (OUT) state frequencies for each sequence, of size num_states*num_freq
677      */
678     void computeStateFreqPerSequence (double *freq_per_sequence);
679 
680     void countStatePerSequence (unsigned *count_per_sequence);
681 
682     /**
683      * Make all frequencies a little different and non-zero
684      * @param stateFrqArr (IN/OUT) state frequencies
685      */
686     void convfreq(double *stateFrqArr);
687 
688     /**
689 	 * compute special empirical frequencies for codon alignment: 1x4, 3x4, 3x4C
690 	 * @param state_freq (OUT) is filled with state frequencies, assuming state_freq was allocated with
691 	 * at least num_states entries.
692 	 * @param freq either FREQ_CODON_1x4, FREQ_CODON_3x4, or FREQ_CODON_3x4C
693 	 * @param ntfreq (OUT) nucleotide frequencies, assuming of size 4 for F1x4 and of size 12 for F3x4.
694 	 */
695 	void computeCodonFreq(StateFreqType freq, double *state_freq, double *ntfreq);
696 
697 	/**
698             compute empirical substitution counts between state pairs
699             @param normalize true to normalize row sum to 1, false otherwise
700             @param[out] pair_freq matrix of size num_states*num_states
701             @param[out] state_freq vector of size num_states
702      */
703     virtual void computeDivergenceMatrix(double *pair_freq, double *state_freq, bool normalize = true);
704 
705     /**
706         perform matched-pair tests of symmetry of Lars Jermiin et al.
707         @param[out] sym results of test of symmetry
708         @param[out] marsym results of test of marginal symmetry
709         @param[out] intsym results of test of internal symmetry
710         @param out output stream to print results
711         @param rstream random stream to shuffle alignment columns
712         @param out_stat output stream to print pairwise statistics
713      */
714     virtual void doSymTest(size_t vecid, vector<SymTestResult> &sym, vector<SymTestResult> &marsym,
715                            vector<SymTestResult> &intsym, int *rstream = NULL, vector<SymTestStat> *stats = NULL);
716 
717     /**
718             count the fraction of constant sites in the alignment, update the variable frac_const_sites
719      */
720     virtual void countConstSite();
721 
722     /**
723      * generate uninformative patterns
724      */
725     void generateUninfPatterns(StateType repeat, vector<StateType> &singleton, vector<int> &seq_pos, vector<Pattern> &unobserved_ptns);
726 
727     /**
728      * @param missing_data TRUE for missing data aware correction (for Mark Holder)
729      * @param[out] unobserved_ptns unobserved constant patterns, each entry encoding for one constant character
730      */
731     void getUnobservedConstPatterns(ASCType ASC_type, vector<Pattern> &unobserved_ptns);
732 
733     /**
734             @return the number of ungappy and unambiguous characters from a sequence
735             @param seq_id sequence ID
736      */
737     int countProperChar(int seq_id);
738 
739     /**
740             @return unconstrained log-likelihood (without a tree)
741      */
742     virtual double computeUnconstrainedLogL();
743 
744     /**
745      * 	@return number of states, if it is a partition model, return max num_states across all partitions
746      */
getMaxNumStates()747     virtual int getMaxNumStates() { return num_states; }
748 
749     /** either SEQ_BINARY, SEQ_DNA, SEQ_PROTEIN, SEQ_MORPH, or SEQ_CODON */
750     SeqType seq_type;
751 
752     StateType STATE_UNKNOWN;
753 
754     /**
755             fraction of constant sites
756      */
757     double frac_const_sites;
758 
759     /**
760             fraction of invariant sites, incl. const sites and site like G-S-GG-GGGG
761      */
762     double frac_invariant_sites;
763 
764     /** number of parsimony informative sites */
765     int num_informative_sites;
766 
767     /** number of variant sites */
768     int num_variant_sites;
769 
770     /** number of sites used for parsimony computation, can be informative or variant */
771     int num_parsimony_sites;
772 
773 	/**
774 	 *  map from 64 codon to non-stop codon index
775 	 */
776     char *non_stop_codon;
777 
778 	/**
779 	 * For codon sequences: index of 61 non-stop codons to 64 codons
780 	 * For other sequences: NULL
781 	 */
782 	char *codon_table;
783 
784 	/**
785 	 * For codon_sequences: 64 amino-acid letters for genetic code of AAA,AAC,AAG,AAT,...,TTT
786 	 * For other sequences: NULL
787 	 */
788 	char *genetic_code;
789 
790 	/**
791 	 * Virtual population size for PoMo model
792 	 */
793 	int virtual_pop_size;
794 
795   // TODO DS: Maybe change default to SAMPLING_WEIGHTED_HYPER.
796   /// The sampling method (defaults to SAMPLING_WEIGHTED_BINOM).
797   SamplingType pomo_sampling_method;
798 
799   /** BQM: 2015-07-06,
800       for PoMo data: map from state ID to pair of base1 and base2
801       represented in the high 16-bit and the low 16-bit of uint32_t
802       for base1, bit0-1 is used to encode the base (A,G,C,T) and the remaining 14 bits store the count
803       same interpretation for base2
804   */
805   vector<uint32_t> pomo_sampled_states;
806   IntIntMap pomo_sampled_states_index; // indexing, to quickly find if a PoMo-2-state is already present
807 
808     /* for site-specific state frequency model with Huaichun, Edward, Andrew */
809 
810     /* site to model ID map */
811     IntVector site_model;
812 
813     /** site to state frequency vector */
814     vector<double*> site_state_freq;
815 
816     /**
817      * @return true if data type is SEQ_CODON and state is a stop codon
818      */
819     bool isStopCodon(int state);
820 
821     bool isStandardGeneticCode();
822 
823 	/**
824 	 * @return number of non-stop codons in the genetic code
825 	 */
826 	int getNumNonstopCodons();
827 
828     /* build seq_states containing set of states per sequence
829      * @param add_unobs_const TRUE to add all unobserved constant states (for +ASC model)
830      */
831     //virtual void buildSeqStates(vector<vector<int> > &seq_states, bool add_unobs_const = false);
832 
833     /** Added by MA
834             Compute the probability of this alignment according to the multinomial distribution with parameters determined by the reference alignment
835             @param refAlign the reference alignment
836             @param prob (OUT) the returned probabilty
837 
838             The probability is computed as follows:
839             - From the reference alignment, we count the relative pattern frequencies p_1 ... p_k (sum = 1)
840             - From THIS alignment, we have frequencies d_1 ... d_k (sum = len = nsite)
841             - Prob(THIS | refAlign) = nsite!/(d_1! * ... * d_k!) product(p_i^d_i)
842      */
843     void multinomialProb(Alignment refAlign, double &prob);
844 
845     /** Added by MA
846             Compute the probability of the `expected alignment' according to the multinomial distribution with parameters determined by the pattern's observed frequencies in THIS alignment.
847             The `expected alignment' consists of patterns with log-likelihoods (under some model+tree) given in the input file (logLL).
848             Note that order of the log-likelihoods in inputLL must corresponds to patterns in THIS alignment.
849 
850             @param inputLL the input patterns log-likelihood vector
851             @param prob (OUT) the returned probability
852      */
853     void multinomialProb(DoubleVector logLL, double &prob);
854     void multinomialProb(double *logLL, double &prob);
855 
856     /** Adapted from MA
857             compute the probability of the alignment defined by pattern_freq given this alignment
858      */
859     double multinomialProb(IntVector &pattern_freq);
860 
861 
862     /**
863             get the appearance for a state, helpful for ambigious states
864 
865             For nucleotides, the appearances of A, and C are 1000 and 0100,
866             respectively. If a state is ambiguous, more than one 1 will show up.
867             The appearance of the unknown state is 1111.
868 
869             @param state the state index
870             @param state_app (OUT) state appearance
871      */
872     void getAppearance(StateType state, double *state_app);
873 
874     void getAppearance(StateType state, StateBitset &state_app);
875 
876 	/**
877 	 * read site specific state frequency vectors from a file to create corresponding model
878      * update site_model and site_state_freq variables for this class
879 	 * @param aln input alignment
880 	 * @param site_freq_file file name
881      * @return TRUE if alignment needs to be changed, FALSE otherwise
882 	 */
883 	bool readSiteStateFreq(const char* site_freq_file);
884 
885 
886 protected:
887 
888 
889     /**
890             sequence names
891      */
892     vector<string> seq_names;
893 
894     /**
895             Site to pattern index
896      */
897     IntVector site_pattern;
898 
899     /**
900             hash map from pattern to index in the vector of patterns (the alignment)
901      */
902     PatternIntMap pattern_index;
903 
904 
905     /**
906 	 * special initialization for codon sequences, e.g., setting #states, genetic_code
907 	 * @param sequence_type user-defined sequence type
908 	 */
909 	void initCodon(char *gene_code_id);
910 
911 };
912 
913 
914 void extractSiteID(Alignment *aln, const char* spec, IntVector &site_id);
915 
916 /**
917  create a new Alignment object with possibility of comma-separated file names
918  @param aln_file alignment file name, can be a comma-separated list of file names
919  @param sequence_type sequence data type
920  @param input input file format
921  @param model_name model name
922  */
923 Alignment *createAlignment(string aln_file, const char *sequence_type, InputType intype, string model_name);
924 
925 #endif
926