1 /*
2 
3  HyPhy - Hypothesis Testing Using Phylogenies.
4 
5  Copyright (C) 1997-now
6  Core Developers:
7  Sergei L Kosakovsky Pond (sergeilkp@icloud.com)
8  Art FY Poon    (apoon42@uwo.ca)
9  Steven Weaver (sweaver@temple.edu)
10 
11  Module Developers:
12  Lance Hepler (nlhepler@gmail.com)
13  Martin Smith (martin.audacis@gmail.com)
14 
15  Significant contributions from:
16  Spencer V Muse (muse@stat.ncsu.edu)
17  Simon DW Frost (sdf22@cam.ac.uk)
18 
19  Permission is hereby granted, free of charge, to any person obtaining a
20  copy of this software and associated documentation files (the
21  "Software"), to deal in the Software without restriction, including
22  without limitation the rights to use, copy, modify, merge, publish,
23  distribute, sublicense, and/or sell copies of the Software, and to
24  permit persons to whom the Software is furnished to do so, subject to
25  the following conditions:
26 
27  The above copyright notice and this permission notice shall be included
28  in all copies or substantial portions of the Software.
29 
30  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
31  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
32  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
33  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
34  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
35  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
36  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
37 
38  */
39 
40 #include <ctype.h>
41 
42 #include "dataset.h"
43 #include "translation_table.h"
44 #include "batchlan.h"
45 #include "site.h"
46 #include "global_object_lists.h"
47 
48 using namespace hyphy_global_objects;
49 
50 
51 #define DATA_SET_SWITCH_THRESHOLD 100000
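// Sites are stored column-wise (one _Site object per alignment column) until this
// many sites have been read; past this threshold AddSite() calls
// ConvertRepresentations() to switch to the row-wise ("horizontal") representation,
// one string buffer per sequence, which avoids allocating a separate _Site object
// for every column of a very long alignment.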
52 
53 
54 _DataSet::_DataSet(void) {
55   theTT = &hy_default_translation_table;
56   streamThrough = nil;
57   dsh = nil;
58   useHorizontalRep = false;
59 }
60 
61 _DataSet::_DataSet(long l)
62     : _List((unsigned long)l),
63       theFrequencies(
64           (unsigned long)l) // with estimated number of sites per file
65 {
66   dsh = nil;
67   streamThrough = nil;
68   theTT = &hy_default_translation_table;
69   useHorizontalRep = false;
70 }
71 
72 //_______________________________________________________________________
73 
74 _DataSet::_DataSet(FILE *f) {
75   dsh = nil;
76   useHorizontalRep = false;
77   theTT = &hy_default_translation_table;
78   streamThrough = f;
79   theMap << 0; // current sequence
80   theMap << 0; // current site
81   theMap << 0; // total sites
82 }
83 
84 //_______________________________________________________________________
85 
86 _DataSet::~_DataSet(void) {
87   if (theTT != &hy_default_translation_table) {
88     DeleteObject(theTT);
89   }
90 }
91 
92 //_______________________________________________________________________
93 
94 void _DataSet::Clear(bool) {
95   _List::Clear();
96   theMap.Clear();
97   theFrequencies.Clear();
98   theNames.Clear();
99   if (theTT != &hy_default_translation_table) {
100     DeleteObject(theTT);
101     theTT = &hy_default_translation_table;
102   }
103   noOfSpecies = 0;
104   if (dsh) {
105     dsh->incompletePatterns->Clear(false);
106     delete (dsh);
107     dsh = nil;
108   }
109   useHorizontalRep = false;
110 }
111 
112 //_______________________________________________________________________
113 
114 BaseRef _DataSet::makeDynamic(void) const {
115   _DataSet *r = new _DataSet;
116   r->theMap.Duplicate(&theMap);
117   r->theFrequencies.Duplicate(&theFrequencies);
118   if (theTT != &hy_default_translation_table) {
119     r->theTT->AddAReference();
120   }
121   r->theNames.Duplicate(&theNames);
122   r->streamThrough = streamThrough;
123   // 20170507: SLKP TODO why do we need an additional reference here?
124   // nInstances++;
125   r->dsh = nil;
126   r->useHorizontalRep = false;
127   return r;
128 }
129 
130 //_______________________________________________________________________
131 
132 void _DataSet::ResetIHelper(void) {
133   if (dsh && dsh->characterPositions.lLength == 256)
134     for (long k = 0; k < 256; k++) {
135       dsh->characterPositions.list_data[k] = -1;
136     }
137 }
138 
139 //_______________________________________________________________________
140 
141 void _DataSet::ConvertRepresentations(void) {
142   if (useHorizontalRep == false) {
143     _List horStrings;
144 
145     if (lLength == 0) {
146       AppendNewInstance(new _StringBuffer (128UL));
147     } else {
148       _Site *aSite = (_Site *)list_data[0];
149 
150       for (long str = 0; str < aSite->length(); str++) {
151         horStrings < new _StringBuffer (DATA_SET_SWITCH_THRESHOLD);
152       }
153 
154       for (long s = 0; s < lLength; s++) {
155         _Site *aSite = (_Site *)list_data[s];
156         if (aSite->length() > horStrings.lLength || aSite->GetRefNo() != -1) {
157           HandleApplicationError("Irrecoverable internal error in "
158                                  "_DataSet::ConvertRepresentations. Sorry "
159                                  "about that.",
160                                  true);
161           return;
162         }
163 
164         for (long s2 = 0L; s2 < aSite->length(); s2++) {
165           (*(_StringBuffer *)horStrings.list_data[s2]) << aSite->get_char(s2);
166         }
167       }
168 
169       _List::Clear();
170       theFrequencies.Clear();
171       {
172         for (long s = 0; s < horStrings.lLength; s++) {
173           (*this) << horStrings(s);
174         }
175       }
176     }
177     useHorizontalRep = true;
178   }
179 }
180 
181 //_______________________________________________________________________
182 
183 void _DataSet::AddSite(char c) {
184   if (streamThrough) {
185     if (theMap.list_data[0] == 0) {
186       if (theMap.list_data[1] == 0) {
187         if (theNames.lLength) {
188           fprintf(streamThrough, ">%s\n", ((_String *)theNames(0))->get_str());
189         } else {
190           fprintf(streamThrough, ">Sequence 1\n");
191         }
192         AppendNewInstance(new _String(kEmptyString));
193       }
194 
195       theMap.list_data[1]++;
196       theMap.list_data[2]++;
197       fputc(c, streamThrough);
198     } else {
199       HandleApplicationError("Can't add more sites to a file based data set, "
200                              "when more than one sequence has been written!",
201                              true);
202     }
203   } else {
204     if (useHorizontalRep == false) {
205       if (lLength < DATA_SET_SWITCH_THRESHOLD) {
206         _Site *nC = new _Site(c);
207         theFrequencies << 1L;
208         AppendNewInstance(nC);
209         return;
210       } else {
211         ConvertRepresentations();
212       }
213     }
214 
215     (*((_StringBuffer *)list_data[0])) << c;
216   }
217 }
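/*
 A minimal construction sketch (mirrors the pattern used by _DataSet::Combine below);
 the toy sequences and variable names here are purely illustrative:

     _DataSet * ds = new _DataSet;
     ds->AddName (_String ("seq1"));
     ds->AddName (_String ("seq2"));

     const char seq1 [] = "ACGT",
                seq2 [] = "ACGA";

     for (long site = 0L; site < 4L; site++) {  // the first sequence goes in via AddSite
       ds->AddSite (seq1[site]);
     }
     for (long site = 0L; site < 4L; site++) {  // subsequent sequences via Write2Site
       ds->Write2Site (site, seq2[site]);
     }

     ds->Finalize ();           // collapse identical columns into site patterns
     ds->SetNoSpecies (2);
*/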
218 //_______________________________________________________________________
219 
220 void _DataSet::Write2Site(long index, char c) {
221   if (streamThrough) {
222     if (index == 0) {
223       if (theMap.list_data[2] == theMap.list_data[1]) {
224         theMap.list_data[0]++;
225 
226         if (theNames.lLength > theMap.list_data[0]) {
227           fprintf(streamThrough, "\n>%s\n",
228                   ((_String *)theNames(theMap.list_data[0]))->get_str());
229         } else {
230           fprintf(streamThrough, "\n>Sequence %ld\n", theMap.list_data[0] + 1);
231         }
232 
233         theMap.list_data[1] = 0;
234       } else {
235         HandleApplicationError("Can't write sequences of unequal lengths to a "
236                                "file based data set.");
237         return;
238       }
239     } else if (index != theMap.list_data[1]) {
240       HandleApplicationError("Can't write sites which are not consecutive to a "
241                              "file based data set.");
242       return;
243     }
244 
245     theMap.list_data[1]++;
246     fputc(c, streamThrough);
247   } else {
248     /*if (!dsh)
249     {
250         WarnError ("Internal Error in 'Write2Site' - called Write2Site before
251     any AddSite calls"); return;
252     }*/
253 
254     if (useHorizontalRep) {
255       long currentWritten = ((_String *)list_data[0])->length();
256 
257       if (index >= currentWritten) {
258         HandleApplicationError("Internal Error in 'Write2Site' - index is too "
259                                "high (using compact representation)");
260         return;
261       } else {
262         if (index == 0) {
263           _StringBuffer *newString = new _StringBuffer(currentWritten);
264           (*newString) << c;
265           (*this) < newString;
266         } else {
267           long s = 1;
268           for (; s < lLength; s++) {
269             _StringBuffer *aString = (_StringBuffer *)list_data[s];
270             if (aString->length() == index) {
271               (*aString) << c;
272               break;
273             }
274           }
275           if (s == lLength) {
276             HandleApplicationError("Internal Error in 'Write2Site' - no "
277                                    "appropriate string to write to (compact "
278                                    "representation)");
279             return;
280           }
281         }
282       }
283     } else {
284       if (index >= lLength) {
285         HandleApplicationError(
286             "Internal Error in 'Write2Site' - index is too high");
287         return;
288       }
289       _Site *s = (_Site *)list_data[index];
290       long rN = s->GetRefNo();
291       if (rN == -1) { // independent site
292         // dsh->incompletePatterns->Delete (s,false);
293         (*s) << c;
294         // dsh->incompletePatterns->Insert (s,index);
295       } else {
296         _Site *ss = (_Site *)list_data[rN];
297         long sL = ss->length() - 1;
298         if (ss->get_char(sL) != c) { // appending distinct char
299           s->Duplicate(ss);
300           s->set_char(sL, c);
301           theFrequencies.list_data[rN]--;
302 
303           rN = dsh->incompletePatterns->Find(s);
304           if (rN >= 0) {
305             rN = dsh->incompletePatterns->GetXtra(rN);
306             /*_Site* s2 = (_Site*)list_data[rN];
307             if (s2->GetRefNo() != -1 || !s->Equal(s2))
308             {
309                 WarnError ("Mapping Error");
310             }*/
311             theFrequencies[rN]++;
312             s->Clear();
313             s->SetRefNo(rN);
314           } else {
315             theFrequencies[index]++;
316             s->SetRefNo(-1);
317             dsh->incompletePatterns->Insert(s, index);
318           }
319         }
320       }
321     }
322   }
323 }
324 
325 
326 
327 //_______________________________________________________________________
328 
329 unsigned long _DataSet::GetCharDimension(
330     void) const { // return the size of the alphabet space
331   return theTT->baseLength;
332 }
333 
334 //_______________________________________________________________________
335 
336 long _DataSet::GetNoTypes(void) const // return the total number of columns (sites)
337 {
338   return theMap.countitems();
339 }
340 //_______________________________________________________________________
341 
342 unsigned long
343 _DataSet::GetFreqType(long index) const { // return the frequency of a site
344   return theFrequencies(theMap(index));
345 }
346 //_______________________________________________________________________
347 
348 void _DataSet::SetTranslationTable(_DataSet *newTT) {
349   if (theTT && (theTT != &hy_default_translation_table)) {
350     DeleteObject(theTT);
351   }
352   theTT = (_TranslationTable *)newTT->theTT->makeDynamic();
353 }
354 
355 //_______________________________________________________________________
356 
357 void _DataSet::SetTranslationTable(_TranslationTable *newTT) {
358   if (theTT && (theTT != &hy_default_translation_table)) {
359     DeleteObject(theTT);
360   }
361   theTT = (_TranslationTable *)newTT->makeDynamic();
362 }
363 //_______________________________________________________________________
364 void _DataSet::Finalize(void) {
365   if (streamThrough) {
366     fclose(streamThrough);
367     streamThrough = nil;
368     theMap.Clear();
369   } else {
370     if (useHorizontalRep) {
371       bool good = true;
372       for (long s = 0; s < lLength; s++) {
373         good = good &&
374                ((_String *)list_data[0])->length() == ((_String *)list_data[s])->length();
375       }
376 
377       if (!good) {
378         Clear();
379         HandleApplicationError("Internal Error in _DataSet::Finalize. Unequal "
380                                "sequence lengths in compact representation",
381                                true);
382         return;
383       }
384 
385       _List dups;
386       _List uniquePats;
387       _AVLListX dupsAVL(&dups);
388 
389       long siteCounter = ((_String *)list_data[0])->length();
390 
391       for (long i1 = 0L; i1 < siteCounter; i1++) {
392         _Site *tC = new _Site();
393 
394         for (long i2 = 0L; i2 < lLength; i2++) {
395           (*tC) << ((_String *)list_data[i2])->get_char(i1);
396         }
397 
398         long ff = dupsAVL.Find(tC);
399         if (ff < 0) {
400           uniquePats << tC;
401           dupsAVL.Insert(tC, theFrequencies.lLength);
402           theMap << theFrequencies.lLength;
403           theFrequencies << 1;
404         } else {
405           ff = dupsAVL.GetXtra(ff);
406           theMap << ff;
407           theFrequencies.list_data[ff]++;
408         }
409 
410         DeleteObject(tC);
411       }
412       dupsAVL.Clear(false);
413       _List::Clear();
414       _List::Duplicate(&uniquePats);
415     } else {
416       long j, k;
417 
418       _Site *tC;
419       {
420         _List dups;
421         _AVLListX dupsAVL(&dups);
422 
423         for (long i1 = 0; i1 < lLength; i1++) {
424           tC = (_Site *)list_data[i1];
425           long ff = dupsAVL.Find(tC);
426           if (ff < 0) {
427             dupsAVL.Insert(tC, i1);
428           } else {
429             ff = dupsAVL.GetXtra(ff);
430             tC->Clear();
431             tC->SetRefNo(ff);
432             theFrequencies.list_data[ff]++;
433           }
434         }
435         dupsAVL.Clear(false);
436       }
437 
438       _SimpleList refs(lLength), toDelete(lLength);
439       j = 0;
440 
441       for (long i1 = 0; i1 < lLength; i1++) {
442         tC = (_Site *)(*(_List *)this)(i1);
443         k = tC->GetRefNo();
444         if (k == -1) {
445           refs << j++;
446         } else {
447           toDelete << i1;
448           refs << -1;
449         }
450       }
451 
452       for (long i2 = 0; i2 < lLength; i2++) {
453         tC = (_Site *)(*(_List *)this)(i2);
454         k = tC->GetRefNo();
455         if (k >= 0) {
456           j = refs.list_data[k];
457           if (j < 0) {
458             HandleApplicationError(kErrorStringDatasetRefIndexError);
459           } else {
460             refs.list_data[i2] = j;
461           }
462         }
463       }
464 
465       theMap.Clear();
466       theMap.Duplicate(&refs);
467       DeleteList(toDelete);
468       theFrequencies.DeleteList(toDelete);
469 
470       for (long i3 = 0; i3 < lLength; i3++) {
471         tC = (_Site *)(*(_List *)this)(i3);
472         tC->TrimSpace();
473         tC->SetRefNo(0);
474       }
475       if (dsh) {
476         dsh->incompletePatterns->Clear(false);
477         delete (dsh);
478         dsh = nil;
479       }
480     }
481   }
482 }
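/*
 After Finalize() the object stores only distinct site patterns:
   - the _List payload holds one _Site per distinct alignment column,
   - theFrequencies[i] counts how many original columns map to pattern i,
   - theMap[j] gives the pattern index of original column j.
*/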
483 //_______________________________________________________________________
484 void _DataSet::Compact(long index) {
485   if (useHorizontalRep) {
486     HandleApplicationError(
487         "Internal Error: _DataSet::Compact called with compact representation",
488         true);
489     return;
490   }
491 
492   _Site *tC = (_Site *)(*(_List *)this)(index);
493   if (tC->GetRefNo() != -1)
494   // take care of double referencing
495   {
496     _Site *tCC = tC;
497     long lastRef, count = 0;
498     do {
499       lastRef = tCC->GetRefNo();
500       count++;
501       tCC = (_Site *)(*(_List *)this)(tCC->GetRefNo());
502     } while (tCC->GetRefNo() != -1);
503     if (count > 1) {
504       theFrequencies[lastRef]++;
505     }
506 
507     tC->SetRefNo(lastRef);
508   }
509   /*if (tC->GetRefNo()==-1)
510   {
511    long f = dsh->incompletePatterns->Find(tC);
512    if (f >= 0)
513    {
514           f = dsh->incompletePatterns->GetXtra (f);
515           if (f<index)
516           {
517           theFrequencies[f]++;
518           tC->Clear();
519           tC->SetRefNo(f);
520       }
521       else
522           tC->Finalize();
523    }
524   }*/
525 }
526 
527 //_______________________________________________________________________
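// (site, pos) character access: map alignment column `site` to its stored pattern via
// theMap, then return the character of sequence `pos` within that pattern.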
528 inline char _DataSet::operator()(unsigned long site, unsigned long pos,
529                                  unsigned int) const {
530   return (((_String **)list_data)[theMap.list_data[site]])->get_char(pos);
531 }
532 
533 //_________________________________________________________
534 long _DataSet::ComputeSize(void) {
535   long res = sizeof(_DataSet);
536 
537   res += (theMap.lLength + lLength + theFrequencies.lLength) * sizeof(long);
538   res += lLength * sizeof(_Site);
539 
540   for (long i = 0; i < lLength; i++) {
541     res += ((_Site *)(*(_List *)this)(i))->length();
542   }
543 
544   return res;
545 }
546 
547 //_________________________________________________________
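// Returns the (frequency-weighted) fraction of non-gap characters that belong to the
// declared alphabet; a value close to 1 suggests that the data are consistent with the
// current translation table.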
548 hyFloat _DataSet::CheckAlphabetConsistency(void) {
549   long charsIn = 0L, gaps = 0L, total = 0L;
550 
551   bool checks[256] = {false};
552 
553   char gapChar = theTT->GetGapChar();
554 
555   _String baseSymbols;
556 
557   if (theTT->baseSet.length()) {
558     baseSymbols = theTT->baseSet;
559   } else if (theTT->baseLength == 4) {
560     baseSymbols = "ACGUT";
561   } else if (theTT->baseLength == 20) {
562     baseSymbols =
563         _TranslationTable::GetDefaultTable(HY_TRANSLATION_TABLE_PROTEIN);
564   } else {
565     baseSymbols =
566         _TranslationTable::GetDefaultTable(HY_TRANSLATION_TABLE_BINARY);
567   }
568 
569   for (charsIn = 0; charsIn < baseSymbols.length(); charsIn++) {
570     checks[(unsigned char)baseSymbols.get_char(charsIn)] = true;
571   }
572 
573   charsIn = 0;
574 
575   for (long i = 0; i < lLength; i++) {
576     _String *thisColumn = (_String *)GetItem(i);
577     long w = theFrequencies.get(i);
578     for (long j = 0; j < thisColumn->length(); j++)
579       if (checks[(unsigned char)thisColumn->get_char(j)]) {
580         charsIn += w;
581       } else if (gapChar == thisColumn->get_char(j)) {
582         gaps += w;
583       }
584 
585     total += w * thisColumn->length();
586   }
587 
588   return (hyFloat)charsIn / (total - gaps + 1.);
589 }
590 
591 //___________________________________________________
592 
593 BaseRef _DataSet::toStr(unsigned long) {
594   _StringBuffer *s = new _StringBuffer(NoOfSpecies() * 30), *str;
595 
596   (*s) << _String((long)NoOfSpecies()) << " species:";
597 
598   (*s) << (_String *)theNames.toStr();
599 
600   (*s) << ";\nTotal Sites:" << _String((long)GetNoTypes())
601        << ";\nDistinct Sites:" << _String((long)theFrequencies.lLength);
602 
603   return s;
604 }
605 
606 //___________________________________________________
607 
608 void _DataSet::toFileStr(FILE *dest, unsigned long padding) {
609   fprintf(dest, "%ld species: ", NoOfSpecies());
610   theNames.toFileStr(dest, padding);
611 
612   fprintf(dest, ";\nTotal Sites: %ld", GetNoTypes());
613   fprintf(dest, ";\nDistinct Sites: %ld", theFrequencies.lLength);
614 
615   /*  fprintf (dest,"\n");
616       for (long j=0; j<noOfSpecies;j++)
617       {
618           fprintf (dest,"\n");
619           for (long i=0; i<theMap.lLength; i++)
620           {
621               fprintf (dest,"%c",(*this)(i,j,1));
622           }
623       }*/
624 }
625 
626 //_________________________________________________________
627 
628 void _DataSet::AddName(_String const &s) {
629   theNames.AppendNewInstance(
630       new _String(s, 0, s.FirstNonSpaceIndex(0, -1, kStringDirectionBackward)));
631 }
632 
633 //_________________________________________________________
634 
635 void _DataSet::InsertName(_String const &name, long where) {
636   theNames.InsertElement(new _String(name), where, false);
637 }
638 
639 //_________________________________________________________
640 
641 void _DataSet::MatchIndices(_Formula &f, _SimpleList &receptacle, bool isVert,
642                             long limit, _String const *scope) const {
643   _String varName = isVert ? "siteIndex" : "speciesIndex";
644   varName = AppendContainerName(varName, scope);
645   _Variable *v = CheckReceptacle(&varName, kEmptyString, false);
646 
647   // fprintf (stderr, "\n_DataSet::MatchIndices %d %s [%s] %s\n", isVert, scope
648   // ? scope->sData : "none", varName.sData, ((_String*)f.toStr())->sData);
649 
650   for (long i = 0L; i < limit; i++) {
651     v->SetValue(new _Constant((hyFloat)i), false, i == 0, NULL);
652     HBLObjectRef res = f.Compute();
653     // fprintf (stderr, "%ld %g\n", i, res->Compute()->Value());
654     if (res && !CheckEqual(res->Value(), 0.0)) {
655       receptacle << i;
656     }
657   }
658   v->SetValue(new _Constant(0.0), false, false, NULL);
659 }
660 
661 //_________________________________________________________
662 
663 _TranslationTable *_DataSet::CheckCompatibility(_SimpleList const &ref,
664                                                 char concatOrCombine) {
665   _DataSet *currentSet = (_DataSet *)dataSetList(ref.Element(0));
666 
667   _TranslationTable *theEnd = new _TranslationTable(*(currentSet->theTT));
668 
669   long refNo =
670       concatOrCombine ? currentSet->NoOfSpecies() : currentSet->NoOfColumns();
671   char emptyStringChar = theEnd->GetSkipChar();
672 
673   for (long k = 1; k < ref.lLength; k++) {
674     currentSet = (_DataSet *)dataSetList(ref.Element(k));
675 
676     _TranslationTable *tryMe = theEnd->MergeTables(currentSet->theTT);
677 
678     if (tryMe) {
679       if (emptyStringChar) {
680         DeleteObject(theEnd);
681         theEnd = tryMe;
682         continue;
683       } else {
684         if ((concatOrCombine && (currentSet->NoOfSpecies() == refNo)) ||
685             (!concatOrCombine && (currentSet->NoOfColumns() == refNo))) {
686           DeleteObject(theEnd);
687           theEnd = tryMe;
688           continue;
689         }
690       }
691     }
692     _String warningMessage("The data set ");
693     warningMessage =
694         warningMessage &
695         ((_String *)dataSetNamesList(ref.Element(k)))->Enquote() &
696         _String(" was found incompatible with one of the following data sets ");
697     for (long i = 0; i < k; i++) {
698       if (i) {
699         warningMessage = warningMessage & ", ";
700       }
701       warningMessage = warningMessage &
702                        ((_String *)dataSetNamesList(ref.Element(i)))->Enquote();
703     }
704     HandleApplicationError(warningMessage);
705     DeleteObject(tryMe);
706     DeleteObject(theEnd);
707     return nil;
708   }
709 
710   return theEnd;
711 }
712 
713 //_________________________________________________________
714 
715 _Matrix * _DataSet::HarvestFrequencies (unsigned char unit, unsigned char atom, bool posSpec, _SimpleList& hSegmentation, _SimpleList& vSegmentation, bool countGaps) const {
716 
717     if (hSegmentation.empty () || vSegmentation.countitems() < unit) { // revert to default (all data)
718         if (hSegmentation.empty ()) {
719             hSegmentation.Populate (NoOfSpecies(),0,1);
720         }
721         if (vSegmentation.countitems () <unit) {
722             vSegmentation.Clear();
723             vSegmentation.Populate (GetNoTypes(),0,1);
724         }
725     }
726 
727     if (atom == 0 || unit%atom > 0) { // 20120814 SLKP: changed this behavior to throw errors
728         HandleApplicationError (_String("Atom must be non-zero and divide unit, had ") & _String ((long)unit) & "/" & _String ((long)atom));
729         return new _Matrix (1,1);
730     }
731 
732     _Matrix   *  out = new _Matrix (ComputePower (theTT->LengthOfAlphabet(), atom),
733                                     posSpec?unit/atom:1,
734                                     false,
735                                     true);
736 
737     long     positions  =   unit/atom,
738     static_store [HYPHY_SITE_DEFAULT_BUFFER_SIZE];
739 
740     _String unit_for_counting ((unsigned long)atom);
741 
742     for (unsigned long site_pattern = 0UL; site_pattern + unit <= vSegmentation.lLength;  site_pattern +=unit) { // loop over the set of segments
743         // make sure the partition is kosher
744 
745         /*
746          if (site_pattern + unit > vSegmentation.lLength) {
747             break;
748         }
749         */
750 
751         for (unsigned long primary_site = site_pattern; primary_site < site_pattern+unit; primary_site += atom) {
752 
753             long   index_in_pattern = (primary_site-site_pattern)/atom;
754 
755             for (unsigned long sequence_index = 0; sequence_index <hSegmentation.lLength; sequence_index ++) {
756                 // loop down each column
757 
758                 unsigned long mapped_sequence_index = hSegmentation.list_data[sequence_index];
759                 // build atomic probabilities
760 
761                 for (unsigned long m = 0UL; m<atom; m++ ) {
762                     unit_for_counting.set_char (m, (*this)(vSegmentation.list_data[primary_site+m],mapped_sequence_index,atom));
763                 }
764 
765                 long resolution_count = theTT->MultiTokenResolutions(unit_for_counting, static_store, countGaps);
766 
767                 if (resolution_count > 0UL) {
768 
769                     hyFloat    normalized = 1./resolution_count;
770 
771                     for (long resolution_index = 0UL; resolution_index < resolution_count; resolution_index ++) {
772                         out->theData[posSpec? static_store[resolution_index]*positions+index_in_pattern: static_store[resolution_index]] += normalized;
773                     }
774                 }
775             }
776         }
777     }
778 
779     //scale the matrix now
780 
781     unsigned long row_count    = out->GetHDim(),
782     column_count = out->GetVDim();
783 
784     for (unsigned long column =0UL; column < column_count; column++) { // normalize each _column_ to sum to 1.
785         hyFloat sum = 0.0;
786 
787         for (unsigned long row = 0UL; row < row_count; row++) {
788             sum += out->theData [row*column_count + column];
789         }
790 
791         for (unsigned long row = 0UL; row < row_count; row++) {
792             out->theData [row*column_count + column] /= sum;
793         }
794     }
795 
796 
797     return out;
798 }
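/*
 A minimal usage sketch (assumes `ds` points to a finalized nucleotide _DataSet; the
 variable names are hypothetical):

     _SimpleList all_seqs, all_sites;   // empty lists => use every sequence and site
     _Matrix * codon_pos_freqs = ds->HarvestFrequencies (3, 1, true, all_seqs, all_sites, false);
     // a 4 x 3 matrix: column k holds the nucleotide frequencies at codon position k + 1;
     // each column is normalized to sum to 1, and ambiguities are split uniformly
     DeleteObject (codon_pos_freqs);
*/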
799 
800 
801 
802 
803 //_______________________________________________________________________
804 
805 void    _DataSet::ProcessPartition (_String const & input2 , _SimpleList & target , bool isVertical, int unit_length, _SimpleList const* additionalFilter, _SimpleList const* otherDimension, _String const* scope) const {
806     // TODO SLKP : 20170928 this needs serious cleanup and testing
807 
808     if (input2.empty()) {
809         return;
810     }
811     // decide if the input is an enumeration or a formula
812     long totalLength;
813 
814     if (additionalFilter) {
815         totalLength = additionalFilter->countitems();
816     } else {
817         totalLength = isVertical?theMap.countitems():noOfSpecies;
818     }
819 
820     _String input (input2);
821 
822     if (!input.IsALiteralArgument(true)) { // not a literal argument
823 
824         _Formula fmla, lhs;
825         _FormulaParsingContext fpc;
826         fpc.setScope (scope);
827 
828         long     outcome = Parse (&fmla, input, fpc,&lhs);
829 
830         if (outcome!=HY_FORMULA_EXPRESSION) {
831             HandleApplicationError(input.Enquote() & _String(" is an invalid partition specification"));
832             return;
833         }
834         HBLObjectRef   fV = fmla.Compute();
835         if (fV && fV->ObjectClass()==STRING) {
836             ProcessPartition (((_FString*)fV)->get_str().Enquote(), target, isVertical, unit_length, additionalFilter, nil, scope);
837         } else {
838             _DataSet::MatchIndices (fmla, target, isVertical, totalLength, scope);
839         }
840     } else { // an explicit enumeration or a regular expression
841 
842         // check to see if argument is a callback
843 
844         bool is_regexp   = input (0) =='/' && input (-1) == '/';
845         long is_hbl_function = -1L;
846 
847         if (!is_regexp) {
848             is_hbl_function = hyphy_global_objects::FindBFFunctionName (input);
849             if (is_hbl_function >= 0) {
850                 if (GetBFFunctionArgumentCount (is_hbl_function) !=  2) {
851                     HandleApplicationError(input.Enquote() & _String(" is not a valid callback function: must have two arguments (sequence name and sequence string when partitioning sequences; site string and site frequencies when partitioning sites)"));
852                     return;
853                 }
854 
855             }
856         }
857 
858         if (is_regexp || is_hbl_function >= 0L) {
859             // a regular expression or a callback
860             regex_t*   regex = nil;
861             _Formula   filter_formula;
862 
863             if (is_regexp) {
864                 input.Trim(1,input.length()-2);
865                 int   errCode;
866                 regex = _String::PrepRegExp (input, errCode, true);
867                 if (errCode) {
868                     HandleApplicationError(_String::GetRegExpError(errCode));
869                     return;
870                 }
871             }
872             // now do the matching
873             // using only the sites that are specced in the additionalFilter
874 
875             if (!isVertical) { // partitioning sequences
876 
877                _FString * string_object = nil,
878                         * string_name = nil;
879                if (!is_regexp) {
880                         filter_formula.GetList() < new _Operation()
881                                                  < new _Operation()
882                                                  < new _Operation(kEmptyString,-is_hbl_function-1L);
883 
884                    string_object = new _FString;
885                    string_name   = new _FString;
886                }
887 
888                 const long loop_limit = additionalFilter ? additionalFilter->lLength : totalLength;
889 
890                 for (long specCount = 0L; specCount < loop_limit; specCount++) {
891                     _String pattern ((unsigned long)theMap.countitems());
892                     long    seqPos = additionalFilter ? additionalFilter->Element (specCount) : specCount;
893 
894 
895                     if (otherDimension) {
896                         for (long seqSlider = 0L; seqSlider < otherDimension->lLength; seqSlider ++) {
897                             pattern.set_char(seqSlider, GetSite(otherDimension->Element(seqSlider))->get_char (seqPos));
898                         }
899                     }
900                     else {
901                         for (long seqSlider = 0L; seqSlider < theMap.lLength; seqSlider ++) {
902                             pattern.set_char(seqSlider, GetSite(seqSlider)->get_char (seqPos));
903                         }
904                     }
905 
906                     if (is_regexp) {
907                         if (pattern.RegExpMatch (regex, 0L).countitems()) {
908                             target << specCount;
909                         }
910                     } else {
911                         string_object->SetStringContent(new _StringBuffer (pattern));
912                         string_name->SetStringContent  (new _StringBuffer (*GetSequenceName(seqPos)));
913                         filter_formula.GetIthTerm(1)->SetNumber(string_object);
914                         filter_formula.GetIthTerm(0)->SetNumber(string_name);
915                         if (!CheckEqual(0.,filter_formula.Compute()->Value())) {
916                             target << specCount;
917                         }
918                     }
919                 }
920 
921                 if (!is_regexp) {
922                     filter_formula.GetIthTerm(0)->SetNumber(nil);
923                     filter_formula.GetIthTerm(1)->SetNumber(nil);
924                     DeleteObject (string_object);
925                     DeleteObject (string_name);
926                 }
927             } else {
928 
929                 auto map_site = [] (const _Site* site, _String& buffer, _SimpleList const * mapper) -> void {
930                     mapper->Each ([&] (long value, unsigned long index) -> void {
931                         buffer.set_char (index, site->char_at (value));
932                     });
933                 };
934 
935                 bool         *eligibleMarks = nil;
936 
937 
938 
939                 if (is_regexp) {
940                     eligibleMarks = new bool[lLength];
941                     if (additionalFilter) {
942                         InitializeArray(eligibleMarks, lLength, false);
943                         for (long siteIndex = 0; siteIndex < additionalFilter->lLength; siteIndex ++) {
944                             eligibleMarks[theMap.list_data[additionalFilter->list_data[siteIndex]]] = true;
945                         }
946                     }
947                     else {
948                         InitializeArray(eligibleMarks, lLength, true);
949                     }
950 
951                     _String     *tempString = nil;
952                     _SimpleList matches;
953                     if (otherDimension) {
954                         tempString = new _String (otherDimension->countitems());
955                     }
956 
957                     for (long siteCounter = 0; siteCounter < lLength; siteCounter ++)
958                         if (eligibleMarks[siteCounter]) {
959                             matches.Clear();
960                             if (otherDimension) {
961                                 map_site ((_Site*)GetItem(siteCounter), *tempString, otherDimension);
962                                 matches = tempString->RegExpMatch (regex, 0L);
963                             } else {
964                                 matches = ((_Site**)list_data)[siteCounter]->RegExpMatch (regex, 0L);
965                             }
966                             if (matches.empty()) {
967                                 eligibleMarks[siteCounter] = false;
968                             }
969                         }
970 
971                     DeleteObject (tempString);
972                     if (additionalFilter) {
973                         for (long afi = 0; afi < additionalFilter->lLength; afi++) {
974                             if (eligibleMarks[theMap.list_data[additionalFilter->list_data[afi]]]) {
975                                 target << afi;
976                             }
977                         }
978                     } else {
979                         theMap.Each  ([&target, eligibleMarks] (long site_pattern, unsigned long index) -> void {
980                             if (eligibleMarks [site_pattern]) {
981                                 target << index;
982                             }
983                         });
984                     }
985                 } else {
986 
987 
988                     long freq_dimension = ComputePower (GetCharDimension(), unit_length);
989 
990                     if (freq_dimension > 0xffff) {
991                         HandleApplicationError("The dimension of the character space is too high for callback filtering");
992                         return;
993                     }
994                     eligibleMarks = new bool[theMap.lLength];
995 
996                     if (additionalFilter) {
997                         InitializeArray(eligibleMarks, theMap.lLength, false);
998                         for (long siteIndex = 0; siteIndex < additionalFilter->lLength; siteIndex ++) {
999                             eligibleMarks[additionalFilter->list_data[siteIndex]] = true;
1000                         }
1001                     }
1002                     else {
1003                         InitializeArray(eligibleMarks, theMap.lLength, true);
1004                     }
1005 
1006                     filter_formula.GetList() < new _Operation()
1007                                              < new _Operation()
1008                                              < new _Operation(kEmptyString,-is_hbl_function-1L);
1009 
1010                     _Matrix * strings     = nil,
1011                             * frequencies = nil;
1012 
1013 
1014 
1015                     _List string_list,
1016                           string_storage;
1017 
1018 
1019                     if (otherDimension) {
1020                         for (int i = 0; i < unit_length; i++) {
1021                             string_storage < new _String (otherDimension->countitems());
1022                         }
1023                     }
1024 
1025                     _SimpleList sites (unit_length, 0, 0),
1026                                 sequences;
1027 
1028                     if (otherDimension) {
1029                         sequences = *otherDimension;
1030                     } else {
1031                         sequences.Populate((unsigned long)NoOfSpecies(), 0, 1);
1032                     }
1033 
1034                     for (long siteCounter = 0L; siteCounter + unit_length <= theMap.lLength; siteCounter += unit_length) {
1035                         long unit_space = 0L;
1036                         string_list.Clear();
1037                         for (; unit_space < unit_length; unit_space++) {
1038                             if (eligibleMarks[siteCounter + unit_space]) {
1039                                 sites[unit_space] = siteCounter + unit_space;
1040                                 if (otherDimension) {
1041                                     map_site ((_Site*)GetSite(siteCounter + unit_space), *(_String*)string_storage.GetItem (unit_space), otherDimension);
1042                                 } else {
1043                                     string_list << GetSite (siteCounter + unit_space);
1044                                 }
1045                             } else {
1046                                 break;
1047                             }
1048                         }
1049 
1050                         if (unit_space == unit_length) {
1051                             //eligibleMarks[siteCounter + unit_space]
1052                             BatchDelete(strings, frequencies);
1053                             if (otherDimension) {
1054                                 strings = new _Matrix (string_storage);
1055                             } else {
1056                                 strings = new _Matrix (string_list);
1057                             }
1058 
1059                             filter_formula.GetIthTerm(0)->SetNumber(strings);
1060 
1061                             //(unsigned char unit, unsigned char atom, bool posSpec, _SimpleList& hSegmentation, _SimpleList& vSegmentation, bool countGaps)
1062                             frequencies = HarvestFrequencies (unit_space, unit_space, false, sequences, sites, false);
1063                             filter_formula.GetIthTerm(1)->SetNumber(frequencies);
1064 
1065                             if (!CheckEqual(0.,filter_formula.Compute()->Value())) {
1066                                 continue;
1067                             }
1068                        }
1069 
1070                         for (unit_space = 0L; unit_space < unit_length; unit_space++) {
1071                             eligibleMarks[siteCounter + unit_space] = false;
1072                         }
1073                     }
1074 
1075                     theMap.Each  ([&target, eligibleMarks] (long site_pattern, unsigned long index) -> void {
1076                         if (eligibleMarks [index]) {
1077                             target << index;
1078                         }
1079                     });
1080                     // strings && frequencies will be cleaned up by the destructor of filter_formula
1081                 }
1082 
1083                 delete [] eligibleMarks;
1084             }
1085             if (regex) {
1086                 _String::FlushRegExp (regex);
1087             }
1088         } else {
1089             input = input.KillSpaces ();
1090             // now process the string
1091             long count = 0L,anchor;
1092 
1093             _SimpleList numbers,
1094             links;
1095 
1096             numbers.RequestSpace (1024);
1097             links.RequestSpace (1024);
1098 
1099             // first check if it has a comb filter
1100 
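            // Comb syntax example: "<0110>" keeps the 2nd and 3rd position of every
            // block of four sites, i.e. it selects site indices 1,2, 5,6, 9,10, ...;
            // a '0' excludes a position, any other character includes it.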
1101             if ( input (0) =='<' && input (-1) =='>') {
1102                 for (count=1; count<input.length()-1; count++) {
1103                     if (input.char_at(count) != '0') {
1104                         numbers<<count-1;
1105                     }
1106                 }
1107                 if (numbers.countitems()) {
1108                     long k = input.length()-2; // step size
1109                     anchor = 0;
1110                     if (totalLength == -1) {
1111                         totalLength = theMap.lLength;
1112                     }
1113                     while (anchor<totalLength-k) {
1114                         for (count = 0; count< numbers.lLength; count++) {
1115                             target<<anchor+numbers.list_data[count];
1116                         }
1117                         anchor+=k;
1118                     }
1119                     if ( (k=totalLength-1-anchor) ) {
1120                         for (count = 0; count< numbers.lLength; count++) {
1121                             if (numbers.list_data[count]>k) {
1122                                 break;
1123                             }
1124                             target<<anchor+numbers.list_data[count];
1125                         }
1126                     }
1127                     return;
1128 
1129                 }
1130             }
1131 
1132             while (count<input.length()) {
1133                 anchor = count;
1134 
1135                 for (; count<input.length() && isdigit(input.char_at (count)); count++) ;
1136 
1137                 long    aNumber = (input.Cut (anchor,count-1)).to_long();
1138 
1139                 if (aNumber < 0) {
1140                     _String warnMsg ("A negative number was found in partition specification: ");
1141                     ReportWarning (warnMsg & input.Cut (0,anchor-1) & '?' & input.Cut (anchor,-1));
1142                     target.Clear();
1143                     return;
1144                 }
1145                 numbers<< aNumber;
1146 
1147                 char current_char = input.char_at (count);
1148 
1149                 if (current_char == '<' || current_char == '>') {
1150                     ReportWarning (_String  ("A comb partition cannot be combined with other types. The entire partition is reset to first..last") & input.Cut (0,anchor-1) & '?' & input.Cut (anchor,-1));
1151                     target.Clear();
1152                     return;
1153                 }
1154 
1155                 if (current_char == '&') {
1156                     links << numbers.lLength;
1157                 }
1158 
1159                 // TODO SLKP 20171001 this needs to be checked for correctness
1160                 if (current_char == ',' || count == input.length()) { // wrap it up dude
1161                     if (numbers.countitems() == 1) {
1162                         target<<numbers(0);
1163                     } else {
1164                         if (links.empty()) {
1165                             if (numbers[0]>numbers[1]) { // backward order
1166                                 for (long k = numbers[0]; k>=numbers[1]; k--) {
1167                                     target<<k;
1168                                 }
1169                             } else {
1170                                 for (long k = numbers[0]; k<=numbers[1]; k++) {
1171                                     target<<k;
1172                                 }
1173                             }
1174                         } else {
1175                             // linked locations
1176                             if (links.countitems() != (numbers.countitems()-2) / 2) {
1177                                 ReportWarning ("A part of the partition specification has not been understood and is being skipped.");
1178                                 target.Clear();
1179                                 return;
1180                             } else {
1181                                 _SimpleList signs;
1182                                 signs<<(numbers(0)<numbers(1)?1:-1);
1183                                 for (long k = 0; k<links.lLength; k+=2) {
1184                                     signs<<(numbers(links(k))<numbers(links(k+1))?1:-1);
1185                                 }
1186 
1187                                 for (long k=numbers(0), l=0 ; signs(0)*k<=signs(0)*numbers(1); k+=signs(0), l++) {
1188                                     target<<numbers(0)+l*signs(0);
1189                                     for (long m=0; m<links.lLength; m++) {
1190                                         target<<numbers(links(m))+l*signs(m+1);
1191                                     }
1192                                 }
1193                             }
1194                         }
1195                     }
1196                     numbers.Clear();
1197                     links.Clear();
1198                 }
1199                 count++;
1200             }
1201         }
1202     }
1203 }
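/*
 A minimal sketch of the explicit-enumeration branch (hypothetical names; `ds` is a
 finalized _DataSet). Note that the specification must arrive as a quoted literal,
 otherwise it is evaluated as an HBL formula (see IsALiteralArgument above):

     _SimpleList chosen_sites;
     ds->ProcessPartition (_String ("0-9,12").Enquote(), chosen_sites, true, 1, nil, nil, nil);
     // chosen_sites now holds site indices 0,1,...,9 and 12
*/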
1204 
1205 //_________________________________________________________
1206 
1207 _DataSet *_DataSet::Concatenate(_SimpleList const &ref)
1208 
1209 // concatenates (adds columns together) several datasets.
1210 // If the numbers of species in the datasets differ, the deficiencies are padded
1211 // with omission symbols. If the translation tables differ, they are merged when
1212 // that is possible; otherwise the incompatible datasets are ignored during this
1213 // operation.
1214 
1215 {
1216   _TranslationTable *jointTable = CheckCompatibility(ref, 1);
1217   if (!jointTable) {
1218     return new _DataSet;
1219   }
1220 
1221   _DataSet *bigDataSet = new _DataSet;
1222 
1223   bigDataSet->theTT = jointTable;
1224 
1225   // pass one - determine the maximum number of species present and which
1226   // dataset it comes from
1227 
1228   long maxSpecies = 0, maxDataSet = 0, siteIndex;
1229 
1230   _DataSet *currentSet;
1231 
1232   char emptyStringSlot = jointTable->GetSkipChar();
1233 
1234   for (long i = 0; i < ref.lLength; i++) {
1235     currentSet = (_DataSet *)dataSetList(ref(i));
1236 
1237     long specCount = currentSet->NoOfSpecies(),
1238          siteCount = currentSet->NoOfColumns();
1239 
1240     if (specCount > maxSpecies) {
1241       maxSpecies = specCount;
1242       maxDataSet = i;
1243     }
1244     for (long j = 0; j < siteCount; j++) {
1245       bigDataSet->AddSite((*currentSet)(j, 0, 1));
1246     }
1247   }
1248 
1249   for (long k = 1; k < maxSpecies; k++) {
1250     siteIndex = 0;
1251     for (long i = 0; i < ref.lLength; i++) {
1252       currentSet = (_DataSet *)dataSetList(ref.list_data[i]);
1253 
1254       long cns = currentSet->NoOfSpecies(), cnc = currentSet->NoOfColumns();
1255 
1256       if (cns <= k)
1257         for (long j = 0; j < cnc; j++, siteIndex++) {
1258           bigDataSet->Write2Site(siteIndex, emptyStringSlot);
1259         }
1260       else
1261         for (long j = 0; j < cnc; j++, siteIndex++) {
1262           bigDataSet->Write2Site(siteIndex, (*currentSet)(j, k, 1));
1263         }
1264     }
1265   }
1266 
1267   currentSet = (_DataSet *)dataSetList(ref(maxDataSet));
1268   for (long i = 0L; i < maxSpecies; i++) {
1269     bigDataSet->AddName(*currentSet->GetSequenceName(i));
1270   }
1271 
1272   bigDataSet->Finalize();
1273   bigDataSet->SetNoSpecies(maxSpecies);
1274   return bigDataSet;
1275 }
1276 
1277 //_________________________________________________________
1278 
1279 _DataSet *_DataSet::Combine(_SimpleList const &ref) {
1280 
1281   // combines (adds rows together) several datasets.
1282   // If the numbers of sites in the datasets differ, the shorter datasets are
1283   // padded with omission symbols. If the translation tables differ, they are
1284   // merged when that is possible; otherwise the incompatible datasets are
1285   // ignored during this operation.
1286 
1287   _TranslationTable *joint_table = CheckCompatibility(ref, 0);
1288 
1289   if (!joint_table) {
1290     return new _DataSet;
1291   }
1292 
1293   _DataSet *combined_data = new _DataSet;
1294   combined_data->theTT = joint_table;
1295 
1296   // pass one - determine the maximum number of sites present and which
1297   // dataset it comes from
1298 
1299   unsigned long max_sites = 0UL, total_species_count = 0UL;
1300 
1301   char emptyStringSlot = joint_table->GetSkipChar();
1302 
1303   for (unsigned long set_index = 0UL; set_index < ref.lLength; set_index++) {
1304     _DataSet const *current_data_set =
1305         (_DataSet const *)dataSetList(ref.Element(set_index));
1306     StoreIfGreater(max_sites, current_data_set->NoOfColumns());
1307     total_species_count += current_data_set->NoOfSpecies();
1308   }
1309 
1310   for (unsigned long set_index = 0UL; set_index < ref.lLength; set_index++) {
1311     _DataSet const *current_data_set =
1312         (_DataSet const *)dataSetList(ref.Element(set_index));
1313     unsigned long sites_in_this_set = current_data_set->NoOfColumns(),
1314                   sequences_in_this_set = current_data_set->NoOfSpecies();
1315 
1316     for (unsigned long seq_index = 0UL; seq_index < sequences_in_this_set;
1317          seq_index++) {
1318       combined_data->AddName(*current_data_set->GetSequenceName(seq_index));
1319       if (seq_index == 0UL && set_index == 0UL) {
1320         /* use AddSite to write out the first sequence */
1321         unsigned long site_index = 0UL;
1322         for (site_index = 0UL; site_index < sites_in_this_set; site_index++) {
1323           combined_data->AddSite((*current_data_set)(site_index, 0UL, 1));
1324         }
1325         for (; site_index < max_sites; site_index++) {
1326           combined_data->AddSite(emptyStringSlot);
1327         }
1328       } else {
1329         /* use Write2Site to create subsequent sequences */
1330         unsigned long site_index = 0UL;
1331         for (site_index = 0UL; site_index < sites_in_this_set; site_index++) {
1332           combined_data->Write2Site(
1333               site_index, (*current_data_set)(site_index, seq_index, 1));
1334         }
1335         for (; site_index < max_sites; site_index++) {
1336           combined_data->Write2Site(site_index, emptyStringSlot);
1337         }
1338       }
1339     }
1340   }
1341 
1342   combined_data->Finalize();
1343   combined_data->SetNoSpecies(total_species_count);
1344   return combined_data;
1345 }
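/*
 A minimal sketch of combining previously read datasets (this assumes Concatenate and
 Combine are reachable as static members and that the indices refer to entries of the
 global dataSetList, as CheckCompatibility expects; variable names are hypothetical):

     _SimpleList ds_indices;
     ds_indices << 0;
     ds_indices << 1;

     _DataSet * side_by_side = _DataSet::Concatenate (ds_indices);  // append columns
     _DataSet * stacked      = _DataSet::Combine     (ds_indices);  // append rows
*/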
1346 
1347   //_______________________________________________________________________
1348 
1349 _String*        _DataSet::GetSequenceCharacters (long seqID)  const {
1350 
1351   unsigned long        upTo = NoOfColumns();
1352   _StringBuffer * aSequence = new _StringBuffer (upTo);
1353 
1354   if (seqID >= 0L && seqID < noOfSpecies) {
1355     for (unsigned long k2=0UL; k2<upTo; k2++) {
1356       (*aSequence) << GetSite (k2)->char_at(seqID);
1357     }
1358   }
1359   aSequence->TrimSpace ();
1360   return aSequence;
1361 }
1362 
1363 //_________________________________________________________
1364 
1365 bool    StoreADataSet (_DataSet* ds, _String* setName) {
1366     if (!setName->IsValidIdentifier (fIDAllowCompound | fIDAllowFirstNumeric)) {
1367         HandleApplicationError (setName->Enquote() & " is not a valid identifier while constructing a DataSet");
1368         return false;
1369     }
1370 
1371     long type = HY_BL_DATASET, index;
1372     _DataSet * existing_ds = (_DataSet * )_HYRetrieveBLObjectByNameMutable (*setName, type, &index, false, false);
1373 
1374 
1375     if (! existing_ds) {
1376         dataSetNamesList << setName;
1377         dataSetList < ds;
1378     } else {
1379 
1380 
1381         bool isDifferent = existing_ds->NoOfSpecies () != ds->NoOfSpecies() ||
1382         existing_ds->NoOfColumns () != ds->NoOfColumns() ||
1383         existing_ds->NoOfUniqueColumns () != ds->NoOfUniqueColumns() ||
1384         (existing_ds->GetTT () != ds->GetTT() && !(*existing_ds->GetTT () == *ds->GetTT()));
1385 
1386         for (AVLListXLIteratorKeyValue filter_key_value : ObjectIndexer (HY_BL_DATASET_FILTER)) {
1387             _DataSetFilter * filter = (_DataSetFilter*) filter_key_value.get_object();
1388             if (filter->GetData() == existing_ds) {
1389                 if (isDifferent) {
1390                     ReportWarning (_String("Overwriting dataset '") & *setName & "' caused DataSetFilter " & GetFilterName(filter_key_value.get_index())->Enquote('\'') & " to be deleted");
1391                     DeleteDataFilter(filter_key_value.get_index());
1392                 } else {
1393                     filter->SetData(ds);
1394                 }
1395             }
1396         }
1397 
1398         dataSetList.Replace(index,ds,false);
1399     }
1400 
1401     CheckReceptacleAndStore (*setName&".mapping",kEmptyString,false, new _MathObject, false);
1402 
1403     if (hy_env::EnvVariableTrue(hy_env::normalize_sequence_names)) {
1404         _List _id_mapping;
1405         _AVLListXL id_mapping (&_id_mapping);
1406         bool       did_something = false;
1407 
1408         for (unsigned long i = 0UL; i < ds->NoOfSpecies(); i ++) {
1409             _String * old_name = new _String (*ds->GetSequenceName (i));
1410             if (! old_name->IsValidIdentifier(fIDAllowFirstNumeric) ) {
1411                 *ds->GetSequenceName (i) = ds->GetSequenceName (i)->ConvertToAnIdent(fIDAllowFirstNumeric);
1412                 did_something = true;
1413             }
1414             if (id_mapping.Find (ds->GetSequenceName (i)) >= 0) {
1415                 _String new_name (*ds->GetSequenceName (i));
1416                 long suffix = 1L;
1417                 do {
1418                     new_name = *ds->GetSequenceName (i) & "_" & suffix++;
1419                 } while (id_mapping.Find (&new_name) >= 0);
1420                 *ds->GetSequenceName (i) = new_name;
1421                 did_something = true;
1422             }
1423 
1424             ds->GetSequenceName (i)->AddAReference();
1425             id_mapping.Insert (ds->GetSequenceName (i), (long)old_name, false, false);
1426         }
1427 
1428         if (did_something) {
1429             _AssociativeList * mapping = new _AssociativeList();
1430 
1431             for (AVLListXLIteratorKeyValue filter_key_value : AVLListXLIterator (&id_mapping)) {
1432                 //printf ("%d => %s\n", filter_key_value.get_index(), ((_String*)filter_key_value.get_object())->get_str());
1433                 mapping->MStore(*(_String *)id_mapping.Retrieve (filter_key_value.get_index()), *(_String*)filter_key_value.get_object());
1434             }
1435 
1436             CheckReceptacleAndStore (*setName&".mapping",kEmptyString,false, mapping, false);
1437         }
1438     }
1439 
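    // export summary statistics alongside the data set: <setName>.species, .sites and .unique_sites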
1440     CheckReceptacleAndStore (*setName&".species",kEmptyString,false, new _Constant (ds->NoOfSpecies()), false);
1441     CheckReceptacleAndStore (*setName&".sites",kEmptyString,false, new _Constant (ds->NoOfColumns()), false);
1442     CheckReceptacleAndStore (*setName&".unique_sites",kEmptyString,false, new _Constant (ds->NoOfUniqueColumns()), false);
1443 
1444     return true;
1445 }
1446 
1447 //_________________________________________________________
1448 //_________________________________________________________
1449 // reading the data set file in here
1450 
1451 //_________________________________________________________
1452 void    checkTTStatus (FileState* fs) { // check whether the translation table needs to be refreshed
1453     if (fs->translationTable == &hy_default_translation_table) {
1454         fs->translationTable =  (_TranslationTable*)hy_default_translation_table.makeDynamic();
1455     }
1456 }
1457 //_________________________________________________________
1458 void    processCommand (_String * s, FileState* fs) {
1459 
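    // inline commands have the form COMMAND : DATA (e.g. $FORMAT:"PHYLIPI" on a '$' line);
    // unrecognized commands are ignored and malformed ones only generate a warning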
1460     static const _List _CommandList (
1461         new _String ("BASESET"),
1462         new _String ("FORMAT"),
1463         new _String ("RAWLINE"),
1464         new _String ("REPEAT"),
1465         new _String ("TOKEN")
1466     );
1467 
1468     static const _String  kBase20  ("BASE20"),
1469                           kPHYLIPi ("PHYLIPI"),
1470                           kPHYLIPs ("PHYLIPS"),
1471                           kRAW     ("RAW");
1472 
1473 
1474     long f = -1,
1475          command_index;
1476 
1477     for (command_index=0L; command_index<_CommandList.countitems(); ++command_index) {
1478         f = s->Find (*(_String*)_CommandList.GetItem (command_index));
1479         if ( f!= kNotFound) {
1480             break;
1481         }
1482     }
1483 
1484     try {
1485         if (f == kNotFound) { // unrecognized command
1486             return;
1487         } else {
1488             // trim the string
1489             //s->Trim (f+((_String*)CommandList(i))->Length(),-1);
1490 
1491             f = s->Find (":", f + 1L + ((_String*)_CommandList.GetItem (command_index))->length());
1492 
1493             if (f == kNotFound) { // poorly formed command
1494                 throw (s->Enquote('[', ']') & " was not of the form COMMAND : DATA");
1495             }
1496 
1497 
1498             if (command_index >= 1 &&  command_index<=3 ) {
1499                 long start = f + 1;
1500                 long end = s->ExtractEnclosedExpression(start, '"', '"', 0);
1501 
1502                 if (end == kNotFound || end - start <= 2L) {
1503                     throw (s->Enquote('[', ']') & " was not of the form COMMAND : \"DATA\" (missing quotes)");
1504                 }
1505                 s->Trim (start + 1L, end - 1L);
1506            } else {
1507                s->Trim (f + 1L, kStringEnd);
1508            }
1509 
1510            // 's' should now contain only the payload of the command
1511 
1512             switch (command_index) {
1513                     char c;
1514 
1515                 case 4: {// new token
1516                     checkTTStatus (fs);
1517                     // attempt to extract a token definition of the form "c" = "AC"
1518 
1519                     _SimpleList matches = s->RegExpMatch("\\\"([a-z,A-Z])\\\"\\ *=\\ *\\\"([a-z,A-Z]+)\\\"", false, true);
1520                     if (matches.countitems() == 6) {
1521                         fs->translationTable->AddTokenCode (matches.get (2),s->Cut (matches.get(3), matches.get(4)));
1522                     } else {
1523                         throw (s->Enquote('[', ']') & " was not of the form \"token\"=\"translation\"");
1524                     }
1525                 }
1526                 break;
1527 
1528 
1529                 case 0: { // new base set, e.g. "ACGU"
1530                     checkTTStatus(fs);
1531                     // erase previous char definitions
1532                     fs->translationTable->translationsAdded.Clear();
1533                     fs->translationTable->tokensAdded = "";
1534                     if (*s != kBase20) {
1535                         long start = 0;
1536                         long end = s->ExtractEnclosedExpression(start, '"', '"', 0);
1537                         if (end == kNotFound || end - start <= 3L) {
1538                              throw (s->Enquote('[', ']') & " was not of the form \"at least two letters\"");
1539                         }
1540                         // TODO : there is no check that baseset is actually valid (e.g. no duplicate characters etc)
1541                         fs->translationTable->AddBaseSet (s->Cut (start + 1, end - 1));
1542                     } else {
1543                         fs->translationTable->AddBaseSet (kEmptyString);
1544                         fs->translationTable->baseLength = 20;
1545                     }
1546                 }
1547                 break;
1548 
1549                 case 1: //FORMAT
1550                     if (*s== kPHYLIPi) { // PHYLIP Interleaved
1551                         fs->fileType = 1;
1552                         fs->interleaved = TRUE;
1553                     } else if (*s== kPHYLIPs) { // PHYLIP sequential
1554                         fs->fileType = 1;
1555                         fs->interleaved = FALSE;
1556                     }
1557                     if (*s== kRAW) { // RAW Sequential Data (as in NEXUS)
1558                         fs->fileType = 2;
1559                         fs->interleaved = FALSE;
1560                     }
1561                     fs->autoDetect = false;
1562                     break;
1563 
1564                 case 3: // REPEAT CHAR
1565                     fs->repeat = s->get_char(0);
1566                     break;
1567 
1568                 case 2: // RAWLINE template e.g 1,-1 skips one word at the beginning and one word at the end
1569                     _List chips (s,',');
1570                     chips.ForEach([&] (BaseRef number, unsigned long index) -> void {
1571                         fs->rawLinesFormat<< ((_String*)number)->to_long();
1572                     });
1573                     break;
1574 
1575             }
1576         }
1577     } catch (const _String& warning) {
1578         ReportWarning (warning);
1579     }
1580 }
1581 //_________________________________________________________
1582 
1583 void    FilterRawString (_String& s, FileState* fs, _DataSet & ds) {
1584     // TODO: SLKP 20180803 this needs to be tested or deprecated.
1585     s.CompressSpaces();
1586     _List words (s.Tokenize(" "));
1587 
1588     long current_start = 0L,
1589          current_end   = (long)words.countitems () - 1L;
1590 
1591     fs->rawLinesFormat.Each([&] (long word, unsigned long idx) -> void {
1592         if (word > 0L) {
1593             current_start += word;
1594         } else {
1595             if (word < 0L) {
1596                 current_end += word;
1597             } else {
1598                 if (current_start < current_end) {
1599                     ds.AddName (*((_String*)words.GetItem (current_start)));
1600                     current_start ++;
1601                 }
1602             }
1603         }
1604     });
1605 
1606     if (current_start >= current_end) {
1607         s = kEmptyString;
1608     } else {
1609         s = words.Join(" ", current_start, current_end);
1610     }
1611 }
1612 //_________________________________________________________________________________________________
1613 
1614 
1615 //_________________________________________________________________________________________________
1616 
1617 void    ProcessTree (FileState *fState, hyFile * f, _StringBuffer& CurrentLine) {
1618 
1619     // TODO SLKP 20180921 this does extra work to read in the tree string multiple times;
1620     // the solution is to use a proper buffer wrapper, e.g. the _MultilineBuffer class below (currently unused)
1621 
1622     class _MultilineBuffer : public _StringBuffer {
1623         public:
1624 
1625         _MultilineBuffer (_String const& current_line, FileState *fs, hyFile* f) : _StringBuffer (current_line) {
1626             file_state = fs;
1627             file = f;
1628         }
1629 
1630         virtual char get_char(long index) {
1631             if (index >= 0L && index < s_length) {
1632                 return s_data[index];
1633             } else {
1634                 _StringBuffer  next_line;
1635                 ReadNextLine (file,&next_line,file_state, false);
1636                 if (next_line.nonempty()) {
1637                     *this << next_line;
1638                     return get_char (index);
1639                 }
1640             }
1641             return _String::default_return;
1642         }
1643 
1644         FileState *file_state;
1645         hyFile * file;
1646 
1647     };
1648 
1649 
1650     //_MultilineBuffer mlb (CurrentLine, fState, f);
1651 
1652     _StringBuffer * tree_string = new _StringBuffer (128L);
1653     long start_index = 0,
1654          end_index   = CurrentLine.ExtractEnclosedExpression(start_index, '(', ')', fExtractRespectQuote|fExtractRespectEscape);
1655 
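    // keep appending input lines to the buffer until it contains a balanced '(' ... ')' expression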
1656     while (start_index == kNotFound || end_index == kNotFound) {
1657         _StringBuffer next_line;
1658         ReadNextLine (f,&next_line,fState, false);
        if (next_line.empty()) break; // no more input; report the mismatch below instead of looping forever
1659         CurrentLine = CurrentLine & next_line;
1660         start_index = 0L;
1661         end_index   = CurrentLine.ExtractEnclosedExpression(start_index, '(', ')', fExtractRespectQuote|fExtractRespectEscape);
1662     }
1663 
1664     if (start_index == kNotFound || end_index == kNotFound) {
1665         ReportWarning (tree_string->Enquote() & " has mismatched '(' and ')'");
1666         DeleteObject (tree_string);
1667     } else {
1668         *tree_string << CurrentLine.Cut (start_index, end_index);
1669         tree_string->TrimSpace();
1670         CurrentLine.Trim (end_index + 1, kStringEnd);
1671         hy_env::EnvVariableSetNamespace(hy_env::data_file_tree, new HY_CONSTANT_TRUE, fState->theNamespace, false);
1672         hy_env::EnvVariableSetNamespace(hy_env::data_file_tree_string, new _FString (tree_string), nil, false);
1673     }
1674 }
1675 
1676 //_________________________________________________________
1677 
1678 long    ProcessLine (_String& s, FileState *fs, _DataSet& ds) {
1679     long sitesAttached = 0L;
1680 
1681     try {
1682         s.Each([&] (char letter, unsigned long i) -> void {
1683             letter = toupper(letter);
1684             if (fs->translationTable->IsCharLegal(letter)) { // go on
1685                 if (fs->curSpecies==0) { // add new column
1686                     ds.AddSite (letter);
1687                     sitesAttached++;
1688                 } else { // append to existing column
1689                          //if (c == fs->skip) continue;
1690                          // check to see if this species needs to be padded
1691 
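                    // the repeat character (default '.') stands for "same as the first sequence at this column";
                    // resolve it by copying the stored character (following the reference for duplicate sites)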
1692                     if (letter == fs->repeat) {
1693                         if ( fs->curSite+sitesAttached >= ds.lLength) { // a repeat character with no previously read site to copy from; stop processing this line
1694                             throw sitesAttached;
1695                         }
1696 
1697                         letter = ((_Site*)(ds._List::operator () (fs->curSite+sitesAttached)))->get_char(0);
1698                         if (letter=='\0') {
1699                             letter = ((_Site*)(ds._List::operator ()
1700                                           (((_Site*)(ds._List::operator () (fs->curSite+sitesAttached)))->GetRefNo())))->get_char(0);
1701                         }
1702                     }
1703 
1704                     if (fs->curSite+sitesAttached+1>fs->totalSitesRead) {
1705                         // pad previous species to full length
1706                         _Site * newS = new _Site (fs->skip);
1707                         newS->AppendNCopies(fs->skip, fs->curSpecies-1L);
1708                         (*newS) << letter;
1709 
1710 
1711                         ds.theFrequencies << 1;
1712                         newS->SetRefNo(-1);
1713 
1714                         ds < newS;
1715                         fs->totalSitesRead++;
1716                     } else {
1717                         ds.Write2Site (fs->curSite+sitesAttached, letter);
1718                     }
1719 
1720                     sitesAttached++;
1721                 }
1722             }
1723         });
1724     } catch (long e) {
1725         return e;
1726     }
1727 
1728     // make sure that this species has enough data in it, and if not - pad it with '?'
1729 
1730     if ( fs->curSite+sitesAttached<fs->totalSitesRead && fs->interleaved) {
1731         // pad this species to full length
1732         for (long j = fs->curSite+sitesAttached; j<fs->totalSitesRead; j++) {
1733             ds.Write2Site (j, fs->skip);
1734         }
1735     }
1736     if (!fs->curSpecies) {
1737         fs->totalSitesRead+=sitesAttached;
1738     }
1739     return sitesAttached;
1740 }
1741 
1742 //_________________________________________________________
1743 void    PadLine (FileState& fState, _DataSet& result) { // make sure that there is enough data in this line
1744                                                         // and if not, "pad" it with '?'s
1745     if (fState.curSite<fState.totalSitesRead) // pad line if needed
1746         for (long j = fState.curSite; j<fState.totalSitesRead; j++) {
1747             result.Write2Site (j, fState.skip);
1748         }
1749 }
1750 
1751 //_________________________________________________________
1752 void    ISelector (FileState& fState, _StringBuffer& CurrentLine, _DataSet& result) {
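    // dispatch one input line: in interleaved files species are cycled within each block of lines,
    // while in sequential files each species' data is read contiguously and curSite tracks the running column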
1753     if (fState.interleaved) { // interleaved file
1754         if (fState.curSpecies&&(!((fState.curSpecies)%fState.totalSpeciesExpected))) { // read a chunk of all species
1755             if (fState.totalSitesRead && !result.InternalStorageMode()) {
1756                 for (long i = fState.curSite; i<fState.totalSitesRead; i++) {
1757                     result.Compact(i);
1758                 }
1759 
1760                 result.ResetIHelper();
1761 
1762             }
1763             fState.curSite = fState.totalSitesRead;
1764             fState.curSpecies = 0;
1765             ProcessLine (CurrentLine, &fState, result);
1766             fState.curSpecies = 1;
1767             if (!fState.curSite) {
1768                 fState.totalSpeciesRead++;
1769             }
1770         } else {
1771             ProcessLine (CurrentLine, &fState, result);
1772             if (!fState.curSite) {
1773                 fState.totalSpeciesRead++;
1774             }
1775             fState.curSpecies++;
1776         }
1777     } else {
1778         if (fState.curSpecies+1<fState.totalSpeciesExpected) {
1779             fState.curSpecies++;
1780         }
1781         if (fState.curSpecies == fState.totalSpeciesRead) {
1782             PadLine (fState, result);
1783             fState.curSite = 0;
1784         }
1785         if (fState.totalSpeciesRead<fState.totalSpeciesExpected) {
1786             fState.totalSpeciesRead++;
1787         }
1788 
1789         fState.curSite+=ProcessLine (CurrentLine, &fState, result);
1790 
1791     }
1792 }
1793 
1794 //_________________________________________________________
1795 bool SkipLine (_StringBuffer& theLine, FileState* fS) {
1796 
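    // skip '//' comment lines, blank lines, and '$' command lines once commands are no longer accepted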
1797     if ( theLine.char_at(0) =='/' && theLine.char_at(1)=='/' ) {
1798         return true;
1799     }
1800 
1801     char c = theLine.FirstNonSpace();
1802 
1803     if (c&& (!( c=='$' && !fS->acceptingCommands)) ) {
1804         return false;
1805     }
1806 
1807     return true;
1808 }
1809 
1810 
1811 //_________________________________________________________
1812 void ReadNextLine (hyFile * fp, _StringBuffer *s, FileState* fs, bool, bool upCase) {
1813     _StringBuffer  tempBuffer (1024L);
1814 
1815     fs->currentFileLine ++;
1816 
1817     char lastc;
1818 
1819     if (fp) {
1820         lastc = fp->getc();
1821     } else {
1822         lastc = fs->pInSrc<fs->theSource->length()?fs->theSource->char_at(fs->pInSrc++):0;
1823     }
1824 
1825 
1826     if (fs->fileType != 3) { // not NEXUS - do not skip [..]
1827         if (fp)
1828             while ( !fp->feof() && lastc!=10 && lastc!=13 ) {
1829                 if (lastc) {
1830                     tempBuffer << lastc;
1831                 }
1832 
1833                 lastc = fp->getc();
1834             }
1835         else
1836             while (lastc && lastc!=10 && lastc!=13 ) {
1837                 tempBuffer << lastc;
1838                 lastc = fs->theSource->char_at(fs->pInSrc++);
1839             }
1840 
1841     } else {
1842         if (upCase) {
1843             lastc = toupper(lastc);
1844         }
1845 
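        // NEXUS mode: copy characters into the buffer while skipping [...] comments, which may span line breaks
        // (isSkippingInNEXUS persists in the file state so a comment opened on one line can be closed on a later one)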
1846         while (((fp&&!fp->feof())||(fs->theSource&&(fs->pInSrc<=fs->theSource->length ()))) && lastc!='\r' && lastc!='\n') {
1847             if (lastc=='[') {
1848                 if (fs->isSkippingInNEXUS) {
1849                     ReportWarning ("Nested comments in NEXUS really shouldn't be used.");
1850                 } else {
1851                     fs->isSkippingInNEXUS = true;
1852                 }
1853             }
1854             if (fs->isSkippingInNEXUS) {
1855                 if (lastc==']') {
1856                     fs->isSkippingInNEXUS = false;
1857                     tempBuffer << ' ';
1858                 }
1859             } else {
1860                 tempBuffer << lastc;
1861             }
1862 
1863             if (fp) {
1864                 if (upCase) {
1865                     lastc = toupper(fp->getc());
1866                 } else {
1867                     lastc = fp->getc();
1868                 }
1869             } else {
1870                 if (upCase) {
1871                     lastc = toupper(fs->theSource->char_at(fs->pInSrc++));
1872                 } else {
1873                     lastc = fs->theSource->char_at(fs->pInSrc++);
1874                 }
1875             }
1876 
1877         }
1878 
1879         if ( lastc==10 || lastc==13 ) {
1880             tempBuffer << ' ';
1881         }
1882     }
1883 
1884     tempBuffer.TrimSpace();
1885 
1886     if ( (fp && fp->feof ()) || (fs->theSource && fs->pInSrc >= fs->theSource->length()) ) {
1887         if (tempBuffer.empty ()) {
1888             *s = "";
1889             return;
1890         }
1891     }
1892     *s = tempBuffer;
1893 
1894     if (SkipLine (*s, fs)) {
1895         ReadNextLine(fp,s,fs,false,upCase);
1896     }
1897 
1898     if (s->nonempty() && s->char_at (s->length()-1) == '\n') {
1899         s->Trim (0,(long)s->length()-2L);
1900     }
1901 }
1902 //_________________________________________________________
1903 void    TrimPhylipLine (_String& CurrentLine, _DataSet& ds) {
1904     int  fNS      = CurrentLine.FirstNonSpaceIndex(),
1905          space2   = CurrentLine.FirstSpaceIndex (fNS + 1);
1906 
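    // a name either ends at a run of two or more spaces (PAML-style) or occupies exactly the first 10 characters (strict PHYLIP)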
1907     // hack for PAML support
1908     if (space2 > fNS && isspace(CurrentLine.char_at (space2+1))) {
1909         _String     sequence_name (CurrentLine,fNS, space2);
1910         CurrentLine.Trim(space2+2,-1); // chop out the name
1911         ds.AddName(sequence_name);
1912     } else {
1913         _String     sequence_name (CurrentLine,fNS, fNS+9);
1914         CurrentLine.Trim(fNS+10,-1); // chop out the name
1915         ds.AddName(sequence_name);
1916     }
1917 }
1918 
1919 
1920 //_________________________________________________________
1921 _DataSet* ReadDataSetFile (hyFile *f, char execBF, _String* theS, _String* bfName, _String* namespaceID, _TranslationTable* dT, _ExecutionList* ex) {
1922 
1923     static const _String kNEXUS ("#NEXUS"),
1924                          kDefSeqNamePrefix ("Species");
1925 
1926     bool     doAlphaConsistencyCheck = true;
1927     _DataSet* result = new _DataSet;
1928 
1929     try {
1930 
1931         //if (f) flockfile (f);
1932         if (f) f->lock();
1933 
1934         hy_env::EnvVariableSet(hy_env::data_file_tree_string, new _Matrix, false);
1935 
1936         _String savedLine;
1937 
1938         /*_String         CurrentLine = hy_env::data_file_tree_string & "={{}};",
1939                         savedLine;
1940 
1941         _ExecutionList reset (CurrentLine);
1942         reset.Execute();*/
1943 
1944     #ifdef __HYPHYMPI__
1945         if (hy_mpi_node_rank == 0L)
1946     #endif
1947         terminate_execution = false;
1948 
1949         hy_env::EnvVariableSet(hy_env::data_file_tree, new HY_CONSTANT_FALSE, false);
1950 
1951         // initialize the instance of a file state variable
1952         FileState   fState;
1953         fState.translationTable =  dT;
1954         fState.curSpecies =
1955         fState.totalSpeciesRead =
1956         fState.totalSitesRead =
1957         fState.totalSpeciesExpected =
1958         fState.totalSitesExpected =
1959         fState.curSite =
1960         fState.currentFileLine =
1961         fState.maxStringLength   = 0;
1962         fState.acceptingCommands = true;
1963         fState.allSpeciesDefined = false;
1964         fState.interleaved       = false;
1965         fState.isSkippingInNEXUS = false;
1966         fState.autoDetect        = true;
1967         fState.fileType          = -1;
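        // fileType codes: -1 = not yet determined (autodetect), 0 = '#'/'>' headers, 1 = PHYLIP, 2 = RAW, 3 = NEXUS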
1968         fState.baseLength        = 4;
1969         fState.repeat            = '.';
1970         fState.skip              = 0;
1971         fState.theSource         = theS;
1972         fState.pInSrc            = 0;
1973         fState.theNamespace      = namespaceID;
1974 
1975         if (! f && !theS) {
1976             throw _String ("ReadDataSetFile received null file AND string references. At least one must be specified");
1977         }
1978         // done initializing
1979 
1980         if       (f) {
1981             f->rewind  ();
1982         }
1983 
1984         _StringBuffer     CurrentLine;
1985 
1986         //if (f==NULL) return (_DataSet*)result.makeDynamic();
1987         // nothing to do
1988 
1989         //CurrentLine = kEmptyString;
1990 
1991         ReadNextLine (f,&CurrentLine,&fState);
1992         if (CurrentLine.empty()) {
1993             throw _String ("Empty File Encountered By ReadDataSet.");
1994         } else {
1995             if (CurrentLine.BeginsWith (kNEXUS,false)) {
1996                 ReadNexusFile (fState,f,(*result));
1997                 doAlphaConsistencyCheck = false;
1998             } else {
1999                 long i,j,k, filePosition = -1, saveSpecExpected = 0x7FFFFFFF;
2000                 char c;
2001                 while (CurrentLine.nonempty()) { // stuff to do
2002                                               // check if the line has a command in it
2003 
2004                     c = CurrentLine.FirstNonSpace();
2005                     while (1) {
2006                         if (fState.acceptingCommands) {
2007                             if (c == '$') { // command line
2008                                 processCommand(&CurrentLine, &fState);
2009                                 break;
2010                             }
2011                         }
2012 
2013                         if (!fState.skip) {
2014                             fState.skip = fState.translationTable->GetSkipChar();
2015                         }
2016                         fState.acceptingCommands = FALSE;
2017 
2018                         if (fState.fileType==-1) { // file type not yet decided: '#'/'>' headers or, failing that, sequential PHYLIP
2019                             if ((c == '#')||(c=='>')) { // hash-mark format
2020                                 fState.fileType = 0;
2021                             } else { // assume this is a sequential PHYLIP file
2022                                 fState.fileType = 1;
2023                                 fState.interleaved = false;
2024                             }
2025 
2026                         }
2027                         // decide what to do next
2028                         // if format is PHYLIP and we do not know the expected dimensions,
2029                         //   we must read those in first
2030                         if (fState.fileType==1) { // PHYLIP
2031                             if ((filePosition<0)&&(fState.autoDetect)) {
2032                                 filePosition = (f?
2033                                                 f->tell ()
2034     #ifdef __WINDOZE__
2035                                                 -1
2036     #endif
2037                                                 :fState.pInSrc);
2038                                 savedLine = CurrentLine;
2039                             }
2040 
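                            // the PHYLIP header line is expected to read "<species count> <site count>", optionally followed by 'I' for interleaved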
2041                             if (fState.totalSitesExpected==0 || fState.totalSpeciesExpected==0) { // must read dimensions first
2042                                 i = CurrentLine.FirstNonSpaceIndex();
2043                                 j = CurrentLine.FirstSpaceIndex(i);
2044                                 if (j != kNotFound) {
2045                                     k = CurrentLine.FirstNonSpaceIndex(j);
2046                                     if (k != kNotFound) { // could have dimensions
2047                                         saveSpecExpected = fState.totalSpeciesExpected = CurrentLine.Cut(i,j-1L).to_long();
2048                                         fState.totalSitesExpected=CurrentLine.Cut(k, kStringEnd).to_long();
2049                                     }
2050                                     if (CurrentLine.Find ('I', k, kStringDirectionBackward)>=0) { // interleaved
2051                                         fState.interleaved = true;
2052                                     }
2053                                 }
2054                             } else
2055                                 // now for the data crunching part
2056                                 // detect a line, diagnose it and dispatch accordingly
2057                             {
2058                                 if (fState.interleaved) {
2059                                     if (fState.totalSpeciesRead<fState.totalSpeciesExpected) {
2060                                         TrimPhylipLine (CurrentLine, (*result));
2061                                     }
2062                                     if (fState.curSite && fState.curSpecies >= saveSpecExpected &&
2063                                         fState.totalSitesRead >= fState.totalSitesExpected) {
2064                                         // reached the end of the data - see maybe there is a tree
2065                                         ReadNextLine (f,&CurrentLine,&fState);
2066                                         if (CurrentLine.nonempty()) {
2067                                             if (CurrentLine.FirstNonSpace()=='(') { // could be a tree string
2068                                                 ProcessTree (&fState,f, CurrentLine);
2069                                             }
2070                                         }
2071                                         break;
2072                                     }
2073 
2074                                 } else {
2075                                     if (fState.totalSitesRead > fState.totalSitesExpected)
2076                                         // oops - autodetect incorrectly assumed that the file was sequential
2077                                     {
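                                        // the sequential assumption produced more sites than the header declared:
                                        // reset all counters, rewind to the saved header position and reparse the file as interleaved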
2078                                         fState.curSpecies =
2079                                         fState.totalSpeciesRead =
2080                                         fState.totalSitesRead =
2081                                         fState.curSite =
2082                                         fState.totalSpeciesExpected =
2083                                         fState.totalSitesExpected =
2084                                         fState.maxStringLength = 0;
2085                                         fState.allSpeciesDefined = false;
2086                                         fState.interleaved = true;
2087                                         fState.autoDetect = true;
2088 
2089                                         if(f) {
2090                                             f->seek (filePosition, SEEK_SET);
2091                                         } else {
2092                                             fState.pInSrc = filePosition;
2093                                         }
2094 
2095                                         CurrentLine = savedLine;
2096                                         result->ForEach ([] (BaseRef site, unsigned long) -> void {
2097                                             ((_Site*)site)->TrimSpace();
2098                                         });
2099 
2100                                         result->theNames.Clear();
2101                                         result->theMap.Clear();
2102                                         result->Clear();
2103                                         result->theFrequencies.Clear();
2104                                         if (result->dsh) {
2105                                             result->dsh->incompletePatterns->Clear(false);
2106                                             delete (result->dsh);
2107                                             result->dsh = nil;
2108                                         }
2109                                         continue;
2110                                     }
2111                                     if (fState.totalSpeciesRead==0) {
2112                                         fState.totalSpeciesExpected = 1;
2113                                         if (!fState.curSite) {
2114                                             TrimPhylipLine (CurrentLine, (*result));
2115                                         }
2116                                     }
2117 
2118                                     else if (fState.curSite>=fState.totalSitesExpected) {
2119                                         fState.totalSpeciesExpected++;
2120                                         if (fState.totalSpeciesExpected>saveSpecExpected) {
2121                                             // reached the end of the data - see maybe there is a tree
2122                                             ReadNextLine (f,&CurrentLine,&fState);
2123                                             if (CurrentLine.nonempty()) {
2124                                                 if (CurrentLine.FirstNonSpace()=='(') { // could be a tree string
2125                                                     ProcessTree (&fState,f, CurrentLine);
2126                                                 }
2127                                             }
2128                                             break;
2129                                         }
2130                                         TrimPhylipLine (CurrentLine, (*result));
2131                                     }
2132                                 }
2133 
2134                                 ISelector (fState, CurrentLine, (*result));
2135                             }
2136                             break;
2137                         }
2138                         // that's all for PHYLIP
2139 
2140                         // now handle raw data case
2141                         if (fState.fileType == 2) { // raw data
2142                             FilterRawString(CurrentLine, &fState, (*result));
2143                             if (CurrentLine.empty()) { // nothing left to process once the name fields have been filtered out
2144                                 break;
2145                             }
2146                             if (ProcessLine (CurrentLine, &fState, (*result))) {
2147                                 fState.curSpecies++;
2148                                 fState.totalSpeciesRead++;
2149                             }
2150                             break;
2151                         }
2152 
2153                         // lastly, handle the auto-detect standard case
2154 
2155                         // check to see if the string defines a name
2156                         if (c=='#' || c=='>') { // a name it is
2157                             if (fState.allSpeciesDefined) { // can't define the species after data
2158                                 break;
2159                             } else {
2160                                 if ((!fState.totalSpeciesRead)&&(fState.totalSpeciesExpected>=1)) {
2161                                     fState.interleaved = TRUE;
2162                                 } else {
2163                                     fState.interleaved = FALSE;
2164                                 }
2165                                 fState.totalSpeciesExpected++;
2166                                 CurrentLine.Trim(CurrentLine.FirstNonSpaceIndex(1),kStringEnd);
2167                                 if (CurrentLine.char_at(0) == '#' || CurrentLine.char_at(0) == '>') {
2168                                     CurrentLine = kDefSeqNamePrefix &_String(fState.totalSpeciesExpected);
2169                                 }
2170                                 result->AddName (CurrentLine);
2171                             }
2172                             break;
2173                         }
2174                         // check to see if the string defines a tree
2175                         if (c=='(') {
2176                             ProcessTree (&fState,f, CurrentLine);
2177                             ReadNextLine (f,&CurrentLine,&fState);
2178                         }
2179 
2180                         // check to see where to stick the incoming line
2181 
2182                         if (fState.totalSpeciesExpected == 0) {
2183                             // raw data fed before names defined - skip
2184                             break;
2185                         }
2186                         if( fState.totalSpeciesExpected>1 && fState.totalSpeciesRead == 0) {
2187                             fState.allSpeciesDefined = TRUE;
2188                         }
2189 
2190                         // repeat the structure of PHYLIP reader
2191 
2192                         ISelector (fState, CurrentLine, (*result));
2193 
2194                         break;
2195                     }
2196 
2197                     ReadNextLine (f,&CurrentLine,&fState);
2198 
2199                 }
2200             }
2201         }
2202 
2203 
2204 
2205         if (fState.totalSitesRead && fState.interleaved && !result->InternalStorageMode()) {
2206             for (long i = fState.curSite; i<fState.totalSitesRead; i++) {
2207                 result->Compact(i);
2208             }
2209             result->ResetIHelper();
2210         }
2211 
2212         if ((!fState.interleaved)&&(fState.fileType!=2)) {
2213             PadLine (fState, (*result));
2214         }
2215 
2216 
2217 
2218         // make sure interleaved duplications are handled correctly
2219 
2220         result->Finalize();
2221         result->noOfSpecies       = fState.totalSpeciesRead;
2222         result->theTT             = fState.translationTable;
2223 
2224         // check to see if result may be an amino-acid data
2225         if (doAlphaConsistencyCheck && result->theTT == &hy_default_translation_table) {
2226             if (result->GetNoTypes() == 0)
2227                 // empty data set
2228                 // try binary data
2229             {
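                // the data set came back empty under the current alphabet; retry the whole parse with a two-state (binary) table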
2230                 _TranslationTable *trialTable = new _TranslationTable (hy_default_translation_table);
2231                 trialTable->baseLength = 2;
2232                 if (f)  f->unlock();
2233                 _DataSet * res2 = ReadDataSetFile (f, execBF, theS, bfName, namespaceID, trialTable);
2234                 if (res2->GetNoTypes()) {
2235                     DeleteObject (result);
2236                     return res2;
2237                 }
2238                 DeleteObject (res2);
2239             } else
2240                 // check it out
2241                 if (result->CheckAlphabetConsistency()<0.5)
2242                     // fewer than 50% of the characters fit the default (nucleotide) alphabet
2243                 {
2244                     _TranslationTable trialTable (hy_default_translation_table);
2245                     trialTable.baseLength = 20;
2246                     (*result).theTT = &trialTable;
2247                     if ((*result).CheckAlphabetConsistency()<0.5) {
2248                         CurrentLine = "More than 50% of characters in the data are not in the alphabet.";
2249                         (*result).theTT =  &hy_default_translation_table;
2250                         ReportWarning (CurrentLine);
2251                     } else {
2252                         (*result).theTT = (_TranslationTable*)trialTable.makeDynamic();
2253                     }
2254 
2255                 }
2256 
2257         }
2258         if (nexusBFBody.nonempty()) {
2259             if (execBF == 1) {
2260                 lastNexusDataMatrix = result;
2261 
2262                 long            bfl = GetBFFunctionCount ();
2263 
2264                 _ExecutionList * nexusBF = ex ? ex :  new _ExecutionList;
2265                 if (namespaceID) {
2266                     nexusBF->SetNameSpace(*namespaceID);
2267                 }
2268 
2269 
2270                 bool do_profile = hy_env::EnvVariableTrue("_PROFILE_NEXUS_LOADS_");
2271 
2272 
2273                 nexusBF->BuildList(nexusBFBody,nil, false, true);
2274                 //_ExecutionList nexusBF (nexusBFBody,namespaceID);
2275 
2276 
2277                 if (bfName) {
2278                     nexusBF->sourceFile = *bfName;
2279                 }
2280 
2281                 if (do_profile) {
2282                     nexusBF->StartProfile();
2283                 }
2284                 nexusBF->ExecuteAndClean(bfl);
2285                 if (do_profile) {
2286                     CheckReceptacleAndStore("_NEXUS_PROFILE_DATA_", kEmptyString, false , nexusBF->CollectProfile(), false);
2287                 }
2288 
2289                 if (nexusBF != ex) {
2290                     DeleteObject (nexusBF);
2291                 } else {
2292                     ex->ClearExecutionList();
2293                     ex->Clear();
2294                 }
2295                 nexusBFBody         = kEmptyString;
2296             } else if (execBF == 0) {
2297                 nexusBFBody         = kEmptyString;
2298             }
2299         }
2300     } catch (const _String& err) {
2301         DeleteObject (result);
2302         if (f) f->unlock();
2303         HandleApplicationError(err);
2304         result = nil;
2305     }
2306     if (f) f->unlock();
2307     return result;
2308 }
2309 
2310 
2311