1 /*
2 
3 HyPhy - Hypothesis Testing Using Phylogenies.
4 
5 Copyright (C) 1997-now
6 Core Developers:
7    Sergei L Kosakovsky Pond (sergeilkp@icloud.com)
8    Art FY Poon    (apoon42@uwo.ca)
9    Steven Weaver (sweaver@temple.edu)
10 
11 Module Developers:
12         Lance Hepler (nlhepler@gmail.com)
13         Martin Smith (martin.audacis@gmail.com)
14 
15 Significant contributions from:
16   Spencer V Muse (muse@stat.ncsu.edu)
17   Simon DW Frost (sdf22@cam.ac.uk)
18 
19 Permission is hereby granted, free of charge, to any person obtaining a
20 copy of this software and associated documentation files (the
21 "Software"), to deal in the Software without restriction, including
22 without limitation the rights to use, copy, modify, merge, publish,
23 distribute, sublicense, and/or sell copies of the Software, and to
24 permit persons to whom the Software is furnished to do so, subject to
25 the following conditions:
26 
27 The above copyright notice and this permission notice shall be included
28 in all copies or substantial portions of the Software.
29 
30 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
31 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
32 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
33 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
34 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
35 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
36 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
37 
38 */
39 
40 #pragma once
41 
42 #include "global_things.h"
43 #include "list.h"
44 #include "stdlib.h"
45 #include "translation_table.h"
46 #include "site.h"
47 #include "function_templates.h"
48 
// NOTE(review): this using-directive sits at header scope, so it is in effect
// for every translation unit that includes this file.
using namespace hy_global;

// Neither macro below is referenced inside this header; what they control has
// to be checked at their use sites.
// By its name, a default buffer size for sites — TODO confirm.
#define HYPHY_SITE_DEFAULT_BUFFER_SIZE 512
// By its name, a size threshold at which data set handling switches strategy
// — TODO confirm what is switched and in which units the value is measured.
#define DATA_SET_SWITCH_THRESHOLD 100000
53 
54 // data set file state data struct
55 struct _DSHelper {
56 
57     _SimpleList characterPositions;
58     _List       incompletePatternStorage;
59     _AVLListX*  incompletePatterns;
60 
_DSHelper_DSHelper61     _DSHelper(void) {
62         incompletePatterns = new _AVLListX (&incompletePatternStorage);
63     }
~_DSHelper_DSHelper64     ~_DSHelper(void) {
65         DeleteObject (incompletePatterns);
66     }
67 };
68 
69 
70 class _DataSet : public _List // a complete data set
71 {
72 public:
73   _DataSet(void);
74   _DataSet(long);
75   _DataSet(FILE *);
76   // with estimated number of sites per file
77   virtual ~_DataSet(void);
78 
79   virtual BaseRef makeDynamic(void) const;
80 
81   void AddSite(char);
82 
83   void Write2Site(long, char);
84 
85   void Finalize(void);
86   // remove duplicate data types and compress
87 
88   long GetNoTypes(void) const;
89   // return the number of unique sites
90 
91   unsigned long GetCharDimension(void) const;
92   // return the size of the alphabet space
93 
94   unsigned long GetFreqType(long) const;
95   // return the frequency of a site
96 
GetSite(long index)97   _Site *GetSite(long index) const {
98     return ((_Site **)list_data)[theMap.list_data[index]];
99   }
100 
101   long ComputeSize(void);
102   // compute the size of this object in memory
103 
104   void Clear(bool = true);
105 
106   virtual char operator()(unsigned long, unsigned long, unsigned int) const;
107   // retrieve element pos of site-th site
108 
109   virtual BaseRef toStr(unsigned long = 0UL);
110   // convert to string
111 
112   virtual void toFileStr(FILE *dest, unsigned long = 0UL);
113 
114   void Compact(long);
115   // release string overhead
116   void ConvertRepresentations(void);
117 
118   _Matrix *HarvestFrequencies(unsigned char, unsigned char, bool, _SimpleList &,
119                               _SimpleList &, bool = true) const;
120   // this function counts observed frequencies of elements in a data set
121   // unit is the length of an info unit (nucl - 1, codons - 3)
122   // atom is the "minimal" countable element (nucl - 1, codons - 1)
123   // posSpec - if position of an atom within an item is to be accounted for
124   // segmentation - partition of the underlying DataSet to look at
125   // null for segmentation assumes the entire dataset
126 
127   void MatchIndices(_Formula &, _SimpleList &, bool, long,
128                     _String const * = nil) const;
129   friend void printFileResults(_DataSet *);
InternalStorageMode(void)130   char InternalStorageMode(void) const { return useHorizontalRep; }
NoOfSpecies(void)131   unsigned long NoOfSpecies(void) const { return noOfSpecies; }
NoOfColumns(void)132   unsigned long NoOfColumns(void) const { return theMap.lLength; }
NoOfUniqueColumns(void)133   unsigned long NoOfUniqueColumns(void) const { return lLength; }
134   void AddName(_String const &);
135   void InsertName(_String const &name, long where);
136 
GetSequenceName(unsigned long i)137   _String *GetSequenceName(unsigned long i) const {
138     return (_String *)theNames.GetItem(i);
139   }
140 
GetNames(void)141   _List const &GetNames(void) const { return theNames; }
142 
ClearNames(void)143   void ClearNames(void) { theNames.Clear(); }
144 
145   _String *GetSequenceCharacters(long seqID) const;
146 
SetSequenceName(long index,_String * new_name)147   bool SetSequenceName(long index, _String *new_name) {
148     if (index >= 0L && index < theNames.lLength) {
149       theNames.Replace(index, new_name, false);
150       return true;
151     }
152     return false;
153   }
154 
SetNames(_List const & copy_from)155   void SetNames(_List const &copy_from) {
156     theNames.Clear();
157     theNames << copy_from;
158   }
159 
GetTheMap(void)160   _SimpleList &GetTheMap(void) { return theMap; }
161 
DuplicateMap(void)162   _SimpleList const &DuplicateMap(void) const { return theMap; }
163 
164   friend class _DataSetFilter;
165   friend _DataSet *ReadDataSetFile(hyFile *, char, _String *, _String *,
166                                    _String *, _TranslationTable *,
167                                    _ExecutionList *);
168   friend long ProcessLine(_String &s, FileState *fs, _DataSet &ds);
169 
170   static _DataSet *Concatenate(const _SimpleList &);
171   static _DataSet *Combine(const _SimpleList &);
172 
173   static _TranslationTable *CheckCompatibility(_SimpleList const &ref,
174                                                char concatOrCombine);
175 
176   void ProcessPartition(_String const &, _SimpleList &, bool, int unit_length,
177                         _SimpleList const * = nil, _SimpleList const * = nil,
178                         _String const *scope = nil) const;
179 
180   void SetTranslationTable(_DataSet *newTT);
181   void SetTranslationTable(_TranslationTable *newTT);
GetTT(void)182   _TranslationTable const *GetTT(void) const { return theTT; }
183   hyFloat CheckAlphabetConsistency(void);
184 
SetNoSpecies(unsigned long n)185   void SetNoSpecies(unsigned long n) { noOfSpecies = n; }
186   void ResetIHelper(void);
187 
188 private:
189   _SimpleList theMap,
190       theFrequencies; // remapping vector, and the counter of frequencies
191 
192   unsigned long noOfSpecies;
193 
194   _TranslationTable *theTT; // translation Table, if any
195 
196   _List theNames; // Names of species
197   FILE *streamThrough;
198 
199   _DSHelper *dsh;
200   bool useHorizontalRep;
201 };
202 
203 void ReadNextLine(hyFile *fp, _StringBuffer *s, FileState *fs, bool append = false,
204                   bool upCase = true);
205 
206 _DataSet *ReadDataSetFile(hyFile *, char = 0, _String * = nil, _String * = nil,
207                           _String * = nil,
208                           _TranslationTable * = &hy_default_translation_table,
209                           _ExecutionList *target = nil);
210 
211 
212 bool StoreADataSet(_DataSet *, _String *);
213 void    ReadNexusFile               (FileState& fState, hyFile*f, _DataSet& result);
214 
215 
216 extern _StringBuffer nexusBFBody;
217 extern _DataSet *lastNexusDataMatrix;
218 
219