1 /* 2 3 HyPhy - Hypothesis Testing Using Phylogenies. 4 5 Copyright (C) 1997-now 6 Core Developers: 7 Sergei L Kosakovsky Pond (sergeilkp@icloud.com) 8 Art FY Poon (apoon42@uwo.ca) 9 Steven Weaver (sweaver@temple.edu) 10 11 Module Developers: 12 Lance Hepler (nlhepler@gmail.com) 13 Martin Smith (martin.audacis@gmail.com) 14 15 Significant contributions from: 16 Spencer V Muse (muse@stat.ncsu.edu) 17 Simon DW Frost (sdf22@cam.ac.uk) 18 19 Permission is hereby granted, free of charge, to any person obtaining a 20 copy of this software and associated documentation files (the 21 "Software"), to deal in the Software without restriction, including 22 without limitation the rights to use, copy, modify, merge, publish, 23 distribute, sublicense, and/or sell copies of the Software, and to 24 permit persons to whom the Software is furnished to do so, subject to 25 the following conditions: 26 27 The above copyright notice and this permission notice shall be included 28 in all copies or substantial portions of the Software. 29 30 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 31 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 32 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 33 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 34 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 35 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 36 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 37 38 */ 39 40 #pragma once 41 42 #include "global_things.h" 43 #include "list.h" 44 #include "stdlib.h" 45 #include "translation_table.h" 46 #include "site.h" 47 #include "function_templates.h" 48 49 using namespace hy_global; 50 51 #define HYPHY_SITE_DEFAULT_BUFFER_SIZE 512 52 #define DATA_SET_SWITCH_THRESHOLD 100000 53 54 // data set file state data struct 55 struct _DSHelper { 56 57 _SimpleList characterPositions; 58 _List incompletePatternStorage; 59 _AVLListX* incompletePatterns; 60 _DSHelper_DSHelper61 _DSHelper(void) { 62 incompletePatterns = new _AVLListX (&incompletePatternStorage); 63 } ~_DSHelper_DSHelper64 ~_DSHelper(void) { 65 DeleteObject (incompletePatterns); 66 } 67 }; 68 69 70 class _DataSet : public _List // a complete data set 71 { 72 public: 73 _DataSet(void); 74 _DataSet(long); 75 _DataSet(FILE *); 76 // with estimated number of sites per file 77 virtual ~_DataSet(void); 78 79 virtual BaseRef makeDynamic(void) const; 80 81 void AddSite(char); 82 83 void Write2Site(long, char); 84 85 void Finalize(void); 86 // remove duplicate data types and compress 87 88 long GetNoTypes(void) const; 89 // return the number of unique sites 90 91 unsigned long GetCharDimension(void) const; 92 // return the size of the alphabet space 93 94 unsigned long GetFreqType(long) const; 95 // return the frequency of a site 96 GetSite(long index)97 _Site *GetSite(long index) const { 98 return ((_Site **)list_data)[theMap.list_data[index]]; 99 } 100 101 long ComputeSize(void); 102 // compute the size of this object in memory 103 104 void Clear(bool = true); 105 106 virtual char operator()(unsigned long, unsigned long, unsigned int) const; 107 // retrieve element pos of site-th site 108 109 virtual BaseRef toStr(unsigned long = 0UL); 110 // convert to string 111 112 virtual void toFileStr(FILE *dest, unsigned long = 0UL); 113 114 void Compact(long); 115 // release string overhead 116 void ConvertRepresentations(void); 117 118 _Matrix *HarvestFrequencies(unsigned char, unsigned char, bool, _SimpleList &, 119 _SimpleList &, bool = true) const; 120 // this function counts observed frequencies of elements in a data set 121 // unit is the length of an info unit (nucl - 1, codons - 3) 122 // atom is the "minimal" countable element (nucl - 1, codons - 1) 123 // posSpec - if position of an atom within an item is to be accounted for 124 // segmentation - partition of the underlying DataSet to look at 125 // null for segmentation assumes the entire dataset 126 127 void MatchIndices(_Formula &, _SimpleList &, bool, long, 128 _String const * = nil) const; 129 friend void printFileResults(_DataSet *); InternalStorageMode(void)130 char InternalStorageMode(void) const { return useHorizontalRep; } NoOfSpecies(void)131 unsigned long NoOfSpecies(void) const { return noOfSpecies; } NoOfColumns(void)132 unsigned long NoOfColumns(void) const { return theMap.lLength; } NoOfUniqueColumns(void)133 unsigned long NoOfUniqueColumns(void) const { return lLength; } 134 void AddName(_String const &); 135 void InsertName(_String const &name, long where); 136 GetSequenceName(unsigned long i)137 _String *GetSequenceName(unsigned long i) const { 138 return (_String *)theNames.GetItem(i); 139 } 140 GetNames(void)141 _List const &GetNames(void) const { return theNames; } 142 ClearNames(void)143 void ClearNames(void) { theNames.Clear(); } 144 145 _String *GetSequenceCharacters(long seqID) const; 146 SetSequenceName(long index,_String * new_name)147 bool SetSequenceName(long index, _String *new_name) { 148 if (index >= 0L && index < theNames.lLength) { 149 theNames.Replace(index, new_name, false); 150 return true; 151 } 152 return false; 153 } 154 SetNames(_List const & copy_from)155 void SetNames(_List const ©_from) { 156 theNames.Clear(); 157 theNames << copy_from; 158 } 159 GetTheMap(void)160 _SimpleList &GetTheMap(void) { return theMap; } 161 DuplicateMap(void)162 _SimpleList const &DuplicateMap(void) const { return theMap; } 163 164 friend class _DataSetFilter; 165 friend _DataSet *ReadDataSetFile(hyFile *, char, _String *, _String *, 166 _String *, _TranslationTable *, 167 _ExecutionList *); 168 friend long ProcessLine(_String &s, FileState *fs, _DataSet &ds); 169 170 static _DataSet *Concatenate(const _SimpleList &); 171 static _DataSet *Combine(const _SimpleList &); 172 173 static _TranslationTable *CheckCompatibility(_SimpleList const &ref, 174 char concatOrCombine); 175 176 void ProcessPartition(_String const &, _SimpleList &, bool, int unit_length, 177 _SimpleList const * = nil, _SimpleList const * = nil, 178 _String const *scope = nil) const; 179 180 void SetTranslationTable(_DataSet *newTT); 181 void SetTranslationTable(_TranslationTable *newTT); GetTT(void)182 _TranslationTable const *GetTT(void) const { return theTT; } 183 hyFloat CheckAlphabetConsistency(void); 184 SetNoSpecies(unsigned long n)185 void SetNoSpecies(unsigned long n) { noOfSpecies = n; } 186 void ResetIHelper(void); 187 188 private: 189 _SimpleList theMap, 190 theFrequencies; // remapping vector, and the counter of frequencies 191 192 unsigned long noOfSpecies; 193 194 _TranslationTable *theTT; // translation Table, if any 195 196 _List theNames; // Names of species 197 FILE *streamThrough; 198 199 _DSHelper *dsh; 200 bool useHorizontalRep; 201 }; 202 203 void ReadNextLine(hyFile *fp, _StringBuffer *s, FileState *fs, bool append = false, 204 bool upCase = true); 205 206 _DataSet *ReadDataSetFile(hyFile *, char = 0, _String * = nil, _String * = nil, 207 _String * = nil, 208 _TranslationTable * = &hy_default_translation_table, 209 _ExecutionList *target = nil); 210 211 212 bool StoreADataSet(_DataSet *, _String *); 213 void ReadNexusFile (FileState& fState, hyFile*f, _DataSet& result); 214 215 216 extern _StringBuffer nexusBFBody; 217 extern _DataSet *lastNexusDataMatrix; 218 219