1 /*
2
3 HyPhy - Hypothesis Testing Using Phylogenies.
4
5 Copyright (C) 1997-now
6 Core Developers:
7 Sergei L Kosakovsky Pond (sergeilkp@icloud.com)
8 Art FY Poon (apoon42@uwo.ca)
9 Steven Weaver (sweaver@temple.edu)
10
11 Module Developers:
12 Lance Hepler (nlhepler@gmail.com)
13 Martin Smith (martin.audacis@gmail.com)
14
15 Significant contributions from:
16 Spencer V Muse (muse@stat.ncsu.edu)
17 Simon DW Frost (sdf22@cam.ac.uk)
18
19 Permission is hereby granted, free of charge, to any person obtaining a
20 copy of this software and associated documentation files (the
21 "Software"), to deal in the Software without restriction, including
22 without limitation the rights to use, copy, modify, merge, publish,
23 distribute, sublicense, and/or sell copies of the Software, and to
24 permit persons to whom the Software is furnished to do so, subject to
25 the following conditions:
26
27 The above copyright notice and this permission notice shall be included
28 in all copies or substantial portions of the Software.
29
30 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
31 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
32 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
33 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
34 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
35 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
36 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
37
38 */
39
40 #include <ctype.h>
41
42 #include "dataset.h"
43 #include "translation_table.h"
44 #include "batchlan.h"
45 #include "site.h"
46 #include "global_object_lists.h"
47
48 using namespace hyphy_global_objects;
49
50
51 #define DATA_SET_SWITCH_THRESHOLD 100000
52
53
_DataSet(void)54 _DataSet::_DataSet(void) {
55 theTT = &hy_default_translation_table;
56 streamThrough = nil;
57 dsh = nil;
58 useHorizontalRep = false;
59 }
60
_DataSet(long l)61 _DataSet::_DataSet(long l)
62 : _List((unsigned long)l),
63 theFrequencies(
64 (unsigned long)l) // with estimated number of sites per file
65 {
66 dsh = nil;
67 streamThrough = nil;
68 theTT = &hy_default_translation_table;
69 useHorizontalRep = false;
70 }
71
72 //_______________________________________________________________________
73
_DataSet(FILE * f)74 _DataSet::_DataSet(FILE *f) {
75 dsh = nil;
76 useHorizontalRep = false;
77 theTT = &hy_default_translation_table;
78 streamThrough = f;
79 theMap << 0; // current sequence
80 theMap << 0; // current site
81 theMap << 0; // total sites
82 }
83
84 //_______________________________________________________________________
85
~_DataSet(void)86 _DataSet::~_DataSet(void) {
87 if (theTT != &hy_default_translation_table) {
88 DeleteObject(theTT);
89 }
90 }
91
92 //_______________________________________________________________________
93
Clear(bool)94 void _DataSet::Clear(bool) {
95 _List::Clear();
96 theMap.Clear();
97 theFrequencies.Clear();
98 theNames.Clear();
99 if (theTT != &hy_default_translation_table) {
100 DeleteObject(theTT);
101 theTT = &hy_default_translation_table;
102 }
103 noOfSpecies = 0;
104 if (dsh) {
105 dsh->incompletePatterns->Clear(false);
106 delete (dsh);
107 dsh = nil;
108 }
109 useHorizontalRep = false;
110 }
111
112 //_______________________________________________________________________
113
makeDynamic(void) const114 BaseRef _DataSet::makeDynamic(void) const {
115 _DataSet *r = new _DataSet;
116 r->theMap.Duplicate(&theMap);
117 r->theFrequencies.Duplicate(&theFrequencies);
118 if (theTT != &hy_default_translation_table) {
119 r->theTT->AddAReference();
120 }
121 r->theNames.Duplicate(&theNames);
122 r->streamThrough = streamThrough;
123 // 20170507: SLKP TODO why do we need an additional reference here?
124 // nInstances++;
125 r->dsh = nil;
126 r->useHorizontalRep = false;
127 return r;
128 }
129
130 //_______________________________________________________________________
131
ResetIHelper(void)132 void _DataSet::ResetIHelper(void) {
133 if (dsh && dsh->characterPositions.lLength == 256)
134 for (long k = 0; k < 256; k++) {
135 dsh->characterPositions.list_data[k] = -1;
136 }
137 }
138
139 //_______________________________________________________________________
140
// Switches the data set from the vertical representation (one _Site object
// per alignment column) to the horizontal one (one string buffer per
// sequence row). No-op if the horizontal representation is already active.
void _DataSet::ConvertRepresentations(void) {
  if (useHorizontalRep == false) {
    _List horStrings;

    if (lLength == 0) {
      // Empty data set: seed with a single empty row buffer.
      AppendNewInstance(new _StringBuffer (128UL));
    } else {
      _Site *aSite = (_Site *)list_data[0];

      // One horizontal buffer per sequence; the first site's length
      // determines how many rows (sequences) there are.
      for (long str = 0; str < aSite->length(); str++) {
        horStrings < new _StringBuffer (DATA_SET_SWITCH_THRESHOLD);
      }

      for (long s = 0; s < lLength; s++) {
        _Site *aSite = (_Site *)list_data[s];
        // For the transpose to be well-defined, every site must be fully
        // expanded (no pattern reference) and no longer than the row count.
        if (aSite->length() > horStrings.lLength || aSite->GetRefNo() != -1) {
          HandleApplicationError("Irrecoverable internal error in "
                                 "_DataSet::ConvertRepresentations. Sorry "
                                 "about that.",
                                 true);
          return;
        }

        // Transpose: append this column's characters to each row buffer.
        for (long s2 = 0L; s2 < aSite->length(); s2++) {
          (*(_StringBuffer *)horStrings.list_data[s2]) << aSite->get_char(s2);
        }
      }

      // Replace the column objects with the row buffers.
      _List::Clear();
      theFrequencies.Clear();
      {
        for (long s = 0; s < horStrings.lLength; s++) {
          (*this) << horStrings(s);
        }
      }
    }
    useHorizontalRep = true;
  }
}
180
181 //_______________________________________________________________________
182
// Appends character 'c' as a new site (column) for the FIRST sequence.
// In stream-through mode, the character goes straight to the output file;
// otherwise it is stored in whichever in-memory representation is active.
void _DataSet::AddSite(char c) {
  if (streamThrough) {
    // File-backed mode: only legal while sequence 0 is still being written
    // (theMap holds [current sequence, current site, total sites]).
    if (theMap.list_data[0] == 0) {
      if (theMap.list_data[1] == 0) {
        // Very first character: emit the FASTA header for sequence 0.
        if (theNames.lLength) {
          fprintf(streamThrough, ">%s\n", ((_String *)theNames(0))->get_str());
        } else {
          fprintf(streamThrough, ">Sequence 1\n");
        }
        AppendNewInstance(new _String(kEmptyString));
      }

      theMap.list_data[1]++; // current site
      theMap.list_data[2]++; // total sites
      fputc(c, streamThrough);
    } else {
      HandleApplicationError("Can't add more sites to a file based data set, "
                             "when more that one sequence has been written!",
                             true);
    }
  } else {
    if (useHorizontalRep == false) {
      if (lLength < DATA_SET_SWITCH_THRESHOLD) {
        // Vertical representation: one _Site object per column, with an
        // initial frequency of 1.
        _Site *nC = new _Site(c);
        theFrequencies << 1L;
        AppendNewInstance(nC);
        return;
      } else {
        // Too many sites for the vertical layout; switch to horizontal
        // and fall through to append below.
        ConvertRepresentations();
      }
    }

    // Horizontal representation: append to the first sequence's row string.
    (*((_StringBuffer *)list_data[0])) << c;
  }
}
218 //_______________________________________________________________________
219
// Appends character 'c' to site (column) 'index' for the sequence currently
// being read in. Used when input arrives sequence-by-sequence rather than
// site-by-site; maintains the pattern-compression bookkeeping.
void _DataSet::Write2Site(long index, char c) {
  if (streamThrough) {
    // File-backed mode: sites must be written consecutively and every
    // sequence must match the length of the first one.
    if (index == 0) {
      // Starting a new sequence: the previous one must be complete.
      if (theMap.list_data[2] == theMap.list_data[1]) {
        theMap.list_data[0]++; // advance the current-sequence counter

        if (theNames.lLength > theMap.list_data[0]) {
          fprintf(streamThrough, "\n>%s\n",
                  ((_String *)theNames(theMap.list_data[0]))->get_str());
        } else {
          fprintf(streamThrough, "\n>Sequence %ld\n", theMap.list_data[0] + 1);
        }

        theMap.list_data[1] = 0;
      } else {
        HandleApplicationError("Can't write sequences of unequal lengths to a "
                               "file based data set.");
        return;
      }
    } else if (index != theMap.list_data[1]) {
      HandleApplicationError("Can't write sites which are not consecutive to a "
                             "file based data set.");
      return;
    }

    theMap.list_data[1]++;
    fputc(c, streamThrough);
  } else {
    if (useHorizontalRep) {
      // Horizontal (compact) representation: one row string per sequence.
      // The first row's length says how many sites exist so far.
      long currentWritten = ((_String *)list_data[0])->length();

      if (index >= currentWritten) {
        HandleApplicationError("Internal Error in 'Write2Site' - index is too "
                               "high (using compact representation)");
        return;
      } else {
        if (index == 0) {
          // index 0 starts a brand-new sequence row.
          _StringBuffer *newString = new _StringBuffer(currentWritten);
          (*newString) << c;
          (*this) < newString;
        } else {
          // Find the row that is exactly 'index' characters long — that is
          // the row currently being filled in.
          long s = 1;
          for (; s < lLength; s++) {
            _StringBuffer *aString = (_StringBuffer *)list_data[s];
            if (aString->length() == index) {
              (*aString) << c;
              break;
            }
          }
          if (s == lLength) {
            HandleApplicationError("Internal Error in 'Write2Site' - no "
                                   "appropriate string to write too (compact "
                                   "representation)");
            return;
          }
        }
      }
    } else {
      // Vertical representation: append to the _Site for this column,
      // unsharing compressed patterns when the new character diverges.
      if (index >= lLength) {
        HandleApplicationError(
            "Internal Error in 'Write2Site' - index is too high");
        return;
      }
      _Site *s = (_Site *)list_data[index];
      long rN = s->GetRefNo();
      if (rN == -1) { // independent site: simply append
        (*s) << c;
      } else {
        // This site currently references pattern 'rN'.
        _Site *ss = (_Site *)list_data[rN];
        long sL = ss->length() - 1;
        if (ss->get_char(sL) != c) { // appending a distinct char: unshare
          s->Duplicate(ss);
          s->set_char(sL, c);
          theFrequencies.list_data[rN]--;

          // Try to re-merge with another incomplete pattern that now matches.
          rN = dsh->incompletePatterns->Find(s);
          if (rN >= 0) {
            rN = dsh->incompletePatterns->GetXtra(rN);
            theFrequencies[rN]++;
            s->Clear();
            s->SetRefNo(rN);
          } else {
            // No match: this site becomes a new independent pattern.
            theFrequencies[index]++;
            s->SetRefNo(-1);
            dsh->incompletePatterns->Insert(s, index);
          }
        }
        // If 'c' equals the referenced pattern's last character, the shared
        // pattern already covers this site — nothing to do.
      }
    }
  }
}
324
325
326
327 //_______________________________________________________________________
328
GetCharDimension(void) const329 unsigned long _DataSet::GetCharDimension(
330 void) const { // return the size of the alphabet space
331 return theTT->baseLength;
332 }
333
334 //_______________________________________________________________________
335
GetNoTypes(void) const336 long _DataSet::GetNoTypes(void) const // return the number of unique columns
337 {
338 return theMap.countitems();
339 }
340 //_______________________________________________________________________
341
342 unsigned long
GetFreqType(long index) const343 _DataSet::GetFreqType(long index) const { // return the frequency of a site
344 return theFrequencies(theMap(index));
345 }
346 //_______________________________________________________________________
347
SetTranslationTable(_DataSet * newTT)348 void _DataSet::SetTranslationTable(_DataSet *newTT) {
349 if (theTT && (theTT != &hy_default_translation_table)) {
350 DeleteObject(theTT);
351 }
352 theTT = (_TranslationTable *)newTT->theTT->makeDynamic();
353 }
354
355 //_______________________________________________________________________
356
SetTranslationTable(_TranslationTable * newTT)357 void _DataSet::SetTranslationTable(_TranslationTable *newTT) {
358 if (theTT && (theTT != &hy_default_translation_table)) {
359 DeleteObject(theTT);
360 }
361 theTT = (_TranslationTable *)newTT->makeDynamic();
362 }
363 //_______________________________________________________________________
// Completes data set construction. In stream-through mode, closes the file.
// Otherwise, compresses the in-memory data into unique site patterns:
// theMap maps each column to its pattern, theFrequencies counts how many
// columns share each pattern, and the list itself keeps only unique sites.
void _DataSet::Finalize(void) {
  if (streamThrough) {
    fclose(streamThrough);
    streamThrough = nil;
    theMap.Clear();
  } else {
    if (useHorizontalRep) {
      // Sanity check: every sequence row must match the first row's length.
      bool good = true;
      for (long s = 0; s < lLength; s++) {
        good = good &&
               ((_String *)list_data[0])->length() == ((_String *)list_data[s])->length();
      }

      if (!good) {
        Clear();
        HandleApplicationError("Internal Error in _DataSet::Finalize. Unequal "
                               "sequence lengths in compact representation",
                               true);
        return;
      }

      // Transpose rows back into columns, de-duplicating identical columns
      // via an AVL map keyed on column content.
      _List dups;
      _List uniquePats;
      _AVLListX dupsAVL(&dups);

      long siteCounter = ((_String *)list_data[0])->length();

      for (long i1 = 0L; i1 < siteCounter; i1++) {
        _Site *tC = new _Site();

        // Build column i1 by reading character i1 from every row.
        for (long i2 = 0L; i2 < lLength; i2++) {
          (*tC) << ((_String *)list_data[i2])->get_char(i1);
        }

        long ff = dupsAVL.Find(tC);
        if (ff < 0) {
          // New pattern: record it and map this column to it.
          uniquePats << tC;
          dupsAVL.Insert(tC, theFrequencies.lLength);
          theMap << theFrequencies.lLength;
          theFrequencies << 1;
        } else {
          // Duplicate of an existing pattern: bump its frequency.
          ff = dupsAVL.GetXtra(ff);
          theMap << ff;
          theFrequencies.list_data[ff]++;
        }

        DeleteObject(tC);
      }
      dupsAVL.Clear(false);
      _List::Clear();
      _List::Duplicate(&uniquePats);
    } else {
      // Vertical representation: collapse duplicate _Site objects in place.
      long j, k;

      _Site *tC;
      {
        // Pass 1: detect duplicates; turn each duplicate into a reference
        // (RefNo) to the first occurrence and accumulate its frequency.
        _List dups;
        _AVLListX dupsAVL(&dups);

        for (long i1 = 0; i1 < lLength; i1++) {
          tC = (_Site *)list_data[i1];
          long ff = dupsAVL.Find(tC);
          if (ff < 0) {
            dupsAVL.Insert(tC, i1);
          } else {
            ff = dupsAVL.GetXtra(ff);
            tC->Clear();
            tC->SetRefNo(ff);
            theFrequencies.list_data[ff]++;
          }
        }
        dupsAVL.Clear(false);
      }

      // Pass 2: assign compacted indices (j) to independent sites and mark
      // the referencing duplicates for deletion.
      _SimpleList refs(lLength), toDelete(lLength);
      j = 0;

      for (long i1 = 0; i1 < lLength; i1++) {
        tC = (_Site *)(*(_List *)this)(i1);
        k = tC->GetRefNo();
        if (k == -1) {
          refs << j++;
        } else {
          toDelete << i1;
          refs << -1;
        }
      }

      // Pass 3: remap each duplicate's reference to the compacted index of
      // the pattern it points to.
      for (long i2 = 0; i2 < lLength; i2++) {
        tC = (_Site *)(*(_List *)this)(i2);
        k = tC->GetRefNo();
        if (k >= 0) {
          j = refs.list_data[k];
          if (j < 0) {
            HandleApplicationError(kErrorStringDatasetRefIndexError);
          } else {
            refs.list_data[i2] = j;
          }
        }
      }

      // Install the site->pattern map and drop the duplicate sites.
      theMap.Clear();
      theMap.Duplicate(&refs);
      DeleteList(toDelete);
      theFrequencies.DeleteList(toDelete);

      // Trim the surviving (unique) sites and reset their reference markers.
      for (long i3 = 0; i3 < lLength; i3++) {
        tC = (_Site *)(*(_List *)this)(i3);
        tC->TrimSpace();
        tC->SetRefNo(0);
      }
      if (dsh) {
        dsh->incompletePatterns->Clear(false);
        delete (dsh);
        dsh = nil;
      }
    }
  }
}
483 //_______________________________________________________________________
// Collapses chains of pattern references for site 'index' so the site points
// directly at the terminal (independent) pattern. Only valid for the
// vertical representation.
void _DataSet::Compact(long index) {
  if (useHorizontalRep) {
    HandleApplicationError(
        "Internal Error: _DataSet::Compact called with compact represntation",
        true);
    return;
  }

  _Site *tC = (_Site *)(*(_List *)this)(index);
  if (tC->GetRefNo() != -1)
  // take care of double referencing
  {
    _Site *tCC = tC;
    long lastRef, count = 0;
    // Walk the reference chain until an independent site (RefNo == -1)
    // is reached; 'lastRef' ends up as the terminal pattern's index.
    do {
      lastRef = tCC->GetRefNo();
      count++;
      tCC = (_Site *)(*(_List *)this)(tCC->GetRefNo());
    } while (tCC->GetRefNo() != -1);
    if (count > 1) {
      // NOTE(review): bumps the terminal pattern's frequency only when the
      // chain was longer than one hop — presumably compensating for a count
      // credited to an intermediate link; confirm against callers.
      theFrequencies[lastRef]++;
    }

    // Short-circuit: point this site straight at the terminal pattern.
    tC->SetRefNo(lastRef);
  }
}
526
527 //_______________________________________________________________________
operator ()(unsigned long site,unsigned long pos,unsigned int) const528 inline char _DataSet::operator()(unsigned long site, unsigned long pos,
529 unsigned int) const {
530 return (((_String **)list_data)[theMap.list_data[site]])->get_char(pos);
531 }
532
533 //_________________________________________________________
ComputeSize(void)534 long _DataSet::ComputeSize(void) {
535 long res = sizeof(_DataSet);
536
537 res += (theMap.lLength + lLength + theFrequencies.lLength) * sizeof(long);
538 res += lLength * sizeof(_Site);
539
540 for (long i = 0; i < lLength; i++) {
541 res += ((_Site *)(*(_List *)this)(i))->length();
542 }
543
544 return res;
545 }
546
547 //_________________________________________________________
CheckAlphabetConsistency(void)548 hyFloat _DataSet::CheckAlphabetConsistency(void) {
549 long charsIn = 0L, gaps = 0L, total = 0L;
550
551 bool checks[256] = {false};
552
553 char gapChar = theTT->GetGapChar();
554
555 _String baseSymbols;
556
557 if (theTT->baseSet.length()) {
558 baseSymbols = theTT->baseSet;
559 } else if (theTT->baseLength == 4) {
560 baseSymbols = "ACGUT";
561 } else if (theTT->baseLength == 20) {
562 baseSymbols =
563 _TranslationTable::GetDefaultTable(HY_TRANSLATION_TABLE_PROTEIN);
564 } else {
565 baseSymbols =
566 _TranslationTable::GetDefaultTable(HY_TRANSLATION_TABLE_BINARY);
567 }
568
569 for (charsIn = 0; charsIn < baseSymbols.length(); charsIn++) {
570 checks[(unsigned char)baseSymbols.get_char(charsIn)] = true;
571 }
572
573 charsIn = 0;
574
575 for (long i = 0; i < lLength; i++) {
576 _String *thisColumn = (_String *)GetItem(i);
577 long w = theFrequencies.get(i);
578 for (long j = 0; j < thisColumn->length(); j++)
579 if (checks[(unsigned char)thisColumn->get_char(j)]) {
580 charsIn += w;
581 } else if (gapChar == thisColumn->get_char(j)) {
582 gaps += w;
583 }
584
585 total += w * thisColumn->length();
586 }
587
588 return (hyFloat)charsIn / (total - gaps + 1.);
589 }
590
591 //___________________________________________________
592
toStr(unsigned long)593 BaseRef _DataSet::toStr(unsigned long) {
594 _StringBuffer *s = new _StringBuffer(NoOfSpecies() * 30), *str;
595
596 (*s) << _String((long)NoOfSpecies()) << " species:";
597
598 (*s) << (_String *)theNames.toStr();
599
600 (*s) << ";\nTotal Sites:" << _String((long)GetNoTypes())
601 << ";\nDistinct Sites:" << _String((long)theFrequencies.lLength);
602
603 return s;
604 }
605
606 //___________________________________________________
607
toFileStr(FILE * dest,unsigned long padding)608 void _DataSet::toFileStr(FILE *dest, unsigned long padding) {
609 fprintf(dest, "%ld species: ", NoOfSpecies());
610 theNames.toFileStr(dest, padding);
611
612 fprintf(dest, ";\nTotal Sites: %ld", GetNoTypes());
613 fprintf(dest, ";\nDistinct Sites: %ld", theFrequencies.lLength);
614
615 /* fprintf (dest,"\n");
616 for (long j=0; j<noOfSpecies;j++)
617 {
618 fprintf (dest,"\n");
619 for (long i=0; i<theMap.lLength; i++)
620 {
621 fprintf (dest,"%c",(*this)(i,j,1));
622 }
623 }*/
624 }
625
626 //_________________________________________________________
627
AddName(_String const & s)628 void _DataSet::AddName(_String const &s) {
629 theNames.AppendNewInstance(
630 new _String(s, 0, s.FirstNonSpaceIndex(0, -1, kStringDirectionBackward)));
631 }
632
633 //_________________________________________________________
634
InsertName(_String const & name,long where)635 void _DataSet::InsertName(_String const &name, long where) {
636 theNames.InsertElement(new _String(name), where, false);
637 }
638
639 //_________________________________________________________
640
// Evaluates the HBL formula 'f' for each index in [0, limit), binding the
// index to the variable "siteIndex" (isVert) or "speciesIndex" (!isVert)
// within 'scope', and appends every index where the formula is non-zero
// to 'receptacle'.
void _DataSet::MatchIndices(_Formula &f, _SimpleList &receptacle, bool isVert,
                            long limit, _String const *scope) const {
  _String varName = isVert ? "siteIndex" : "speciesIndex";
  varName = AppendContainerName(varName, scope);
  _Variable *v = CheckReceptacle(&varName, kEmptyString, false);

  for (long i = 0L; i < limit; i++) {
    // Bind the loop index to the HBL variable, then evaluate the formula.
    v->SetValue(new _Constant((hyFloat)i), false, i == 0, NULL);
    HBLObjectRef res = f.Compute();
    if (res && !CheckEqual(res->Value(), 0.0)) {
      receptacle << i;
    }
  }
  // Reset the index variable so callers do not see a stale value.
  v->SetValue(new _Constant(0.0), false, false, NULL);
}
660
661 //_________________________________________________________
662
CheckCompatibility(_SimpleList const & ref,char concatOrCombine)663 _TranslationTable *_DataSet::CheckCompatibility(_SimpleList const &ref,
664 char concatOrCombine) {
665 _DataSet *currentSet = (_DataSet *)dataSetList(ref.Element(0));
666
667 _TranslationTable *theEnd = new _TranslationTable(*(currentSet->theTT));
668
669 long refNo =
670 concatOrCombine ? currentSet->NoOfSpecies() : currentSet->NoOfColumns();
671 char emptyStringChar = theEnd->GetSkipChar();
672
673 for (long k = 1; k < ref.lLength; k++) {
674 currentSet = (_DataSet *)dataSetList(ref.Element(k));
675
676 _TranslationTable *tryMe = theEnd->MergeTables(currentSet->theTT);
677
678 if (tryMe) {
679 if (emptyStringChar) {
680 DeleteObject(theEnd);
681 theEnd = tryMe;
682 continue;
683 } else {
684 if ((concatOrCombine && (currentSet->NoOfSpecies() == refNo)) ||
685 (!concatOrCombine && (currentSet->NoOfColumns() == refNo))) {
686 DeleteObject(theEnd);
687 theEnd = tryMe;
688 continue;
689 }
690 }
691 }
692 _String warningMessage("The data set ");
693 warningMessage =
694 warningMessage &
695 ((_String *)dataSetNamesList(ref.Element(k)))->Enquote() &
696 _String(" was found incompatible with one of the following data sets ");
697 for (long i = 0; i < k; i++) {
698 if (k) {
699 warningMessage = warningMessage & ", ";
700 }
701 warningMessage = warningMessage &
702 ((_String *)dataSetNamesList(ref.Element(k)))->Enquote();
703 }
704 HandleApplicationError(warningMessage);
705 DeleteObject(tryMe);
706 DeleteObject(theEnd);
707 return nil;
708 }
709
710 return theEnd;
711 }
712
713 //_________________________________________________________
714
// Tabulates (possibly position-specific) character frequencies over the
// sequences in 'hSegmentation' and the sites in 'vSegmentation', counting
// 'atom'-character states within 'unit'-character segments. Ambiguous
// characters contribute 1/(number of resolutions); gaps are counted only if
// 'countGaps' is set. Each column of the returned matrix is normalized to
// sum to 1. Caller owns the returned matrix.
_Matrix * _DataSet::HarvestFrequencies (unsigned char unit, unsigned char atom, bool posSpec, _SimpleList& hSegmentation, _SimpleList& vSegmentation, bool countGaps) const {

  if (hSegmentation.empty () || vSegmentation.countitems() < unit) { // revert to default (all data)
    if (hSegmentation.empty ()) {
      hSegmentation.Populate (NoOfSpecies(),0,1);
    }
    if (vSegmentation.countitems () <unit) {
      vSegmentation.Clear();
      vSegmentation.Populate (GetNoTypes(),0,1);
    }
  }

  if (atom == 0 || unit%atom > 0) { // 20120814 SLKP: changed this behavior to throw errors
    HandleApplicationError (_String("Atom must be non-zero and divide unit, had ") & _String ((long)unit) & "/" & _String ((long)atom));
    return new _Matrix (1,1);
  }

  // Rows: one per atom-state (|alphabet|^atom). Columns: one per atom
  // position inside the unit if posSpec, else a single pooled column.
  _Matrix * out = new _Matrix (ComputePower (theTT->LengthOfAlphabet(), atom),
                               posSpec?unit/atom:1,
                               false,
                               true);

  long positions = unit/atom,
       static_store [HYPHY_SITE_DEFAULT_BUFFER_SIZE];

  // Scratch string holding one atom-character state at a time.
  _String unit_for_counting ((unsigned long)atom);

  for (unsigned long site_pattern = 0UL; site_pattern + unit <= vSegmentation.lLength; site_pattern +=unit) { // loop over the set of segments

    for (unsigned long primary_site = site_pattern; primary_site < site_pattern+unit; primary_site += atom) {

      long index_in_pattern = (primary_site-site_pattern)/atom;

      for (unsigned long sequence_index = 0; sequence_index <hSegmentation.lLength; sequence_index ++) {
        // loop down each column

        unsigned long mapped_sequence_index = hSegmentation.list_data[sequence_index];

        // Assemble the atom-character state for this sequence/position.
        for (unsigned long m = 0UL; m<atom; m++ ) {
          unit_for_counting.set_char (m, (*this)(vSegmentation.list_data[primary_site+m],mapped_sequence_index,atom));
        }

        // Resolve the (possibly ambiguous) state into row indices; each
        // resolution gets an equal share of a single count.
        long resolution_count = theTT->MultiTokenResolutions(unit_for_counting, static_store, countGaps);

        if (resolution_count > 0UL) {

          hyFloat normalized = 1./resolution_count;

          for (long resolution_index = 0UL; resolution_index < resolution_count; resolution_index ++) {
            out->theData[posSpec? static_store[resolution_index]*positions+index_in_pattern: static_store[resolution_index]] += normalized;
          }
        }
      }
    }
  }

  // Scale the matrix: normalize each _column_ to sum to 1.
  // NOTE(review): if a column accumulated nothing, 'sum' is 0 and the
  // division produces NaN entries — confirm callers guard against empty
  // partitions.
  unsigned long row_count = out->GetHDim(),
                column_count = out->GetVDim();

  for (unsigned long column =0UL; column < column_count; column++) {
    hyFloat sum = 0.0;

    for (unsigned long row = 0UL; row < row_count; row++) {
      sum += out->theData [row*column_count + column];
    }

    for (unsigned long row = 0UL; row < row_count; row++) {
      out->theData [row*column_count + column] /= sum;
    }
  }

  return out;
}
799
800
801
802
803 //_______________________________________________________________________
804
ProcessPartition(_String const & input2,_SimpleList & target,bool isVertical,int unit_length,_SimpleList const * additionalFilter,_SimpleList const * otherDimension,_String const * scope) const805 void _DataSet::ProcessPartition (_String const & input2 , _SimpleList & target , bool isVertical, int unit_length, _SimpleList const* additionalFilter, _SimpleList const* otherDimension, _String const* scope) const {
806 // TODO SLKP : 20170928 this needs serious cleanup and testing
807
808 if (input2.empty()) {
809 return;
810 }
811 // decide if the input is an enumeration or a formula
812 long totalLength;
813
814 if (additionalFilter) {
815 totalLength = additionalFilter->countitems();
816 } else {
817 totalLength = isVertical?theMap.countitems():noOfSpecies;
818 }
819
820 _String input (input2);
821
822 if (!input.IsALiteralArgument(true)) { // not a literal argument
823
824 _Formula fmla, lhs;
825 _FormulaParsingContext fpc;
826 fpc.setScope (scope);
827
828 long outcome = Parse (&fmla, input, fpc,&lhs);
829
830 if (outcome!=HY_FORMULA_EXPRESSION) {
831 HandleApplicationError(input.Enquote() & _String(" is an invalid partition specification"));
832 return;
833 }
834 HBLObjectRef fV = fmla.Compute();
835 if (fV && fV->ObjectClass()==STRING) {
836 ProcessPartition (((_FString*)fV)->get_str().Enquote(), target, isVertical, unit_length, additionalFilter, nil, scope);
837 } else {
838 _DataSet::MatchIndices (fmla, target, isVertical, totalLength, scope);
839 }
840 } else { // an explicit enumeration or a regular expression
841
842 // check to see if argument is a callback
843
844 bool is_regexp = input (0) =='/' && input (-1) == '/';
845 long is_hbl_function = -1L;
846
847 if (!is_regexp) {
848 is_hbl_function = hyphy_global_objects::FindBFFunctionName (input);
849 if (is_hbl_function >= 0) {
850 if (GetBFFunctionArgumentCount (is_hbl_function) != 2) {
851 HandleApplicationError(input.Enquote() & _String(" is not a valid callback function: must have two arguments (name, sequence for sites; string, frequencies for sites)"));
852 return;
853 }
854
855 }
856 }
857
858 if (is_regexp || is_hbl_function >= 0L) {
859 // a regular expression or a callback
860 regex_t* regex = nil;
861 _Formula filter_formula;
862
863 if (is_regexp) {
864 input.Trim(1,input.length()-2);
865 int errCode;
866 regex = _String::PrepRegExp (input, errCode, true);
867 if (errCode) {
868 HandleApplicationError(_String::GetRegExpError(errCode));
869 return;
870 }
871 }
872 // now set do the matching
873 // using only the sites that are specced in the additionalFilter
874
875 if (!isVertical) { // partitioning sequences
876
877 _FString * string_object = nil,
878 * string_name = nil;
879 if (!is_regexp) {
880 filter_formula.GetList() < new _Operation()
881 < new _Operation()
882 < new _Operation(kEmptyString,-is_hbl_function-1L);
883
884 string_object = new _FString;
885 string_name = new _FString;
886 }
887
888 const long loop_limit = additionalFilter ? additionalFilter->lLength : totalLength;
889
890 for (long specCount = 0L; specCount < loop_limit; specCount++) {
891 _String pattern ((unsigned long)theMap.countitems());
892 long seqPos = additionalFilter ? additionalFilter->Element (specCount) : specCount;
893
894
895 if (otherDimension) {
896 for (long seqSlider = 0L; seqSlider < otherDimension->lLength; seqSlider ++) {
897 pattern.set_char(seqSlider, GetSite(otherDimension->Element(seqSlider))->get_char (seqPos));
898 }
899 }
900 else {
901 for (long seqSlider = 0L; seqSlider < theMap.lLength; seqSlider ++) {
902 pattern.set_char(seqSlider, GetSite(seqSlider)->get_char (seqPos));
903 }
904 }
905
906 if (is_regexp) {
907 if (pattern.RegExpMatch (regex, 0L).countitems()) {
908 target << specCount;
909 }
910 } else {
911 string_object->SetStringContent(new _StringBuffer (pattern));
912 string_name->SetStringContent (new _StringBuffer (*GetSequenceName(seqPos)));
913 filter_formula.GetIthTerm(1)->SetNumber(string_object);
914 filter_formula.GetIthTerm(0)->SetNumber(string_name);
915 if (!CheckEqual(0.,filter_formula.Compute()->Value())) {
916 target << specCount;
917 }
918 }
919 }
920
921 if (!is_regexp) {
922 filter_formula.GetIthTerm(0)->SetNumber(nil);
923 filter_formula.GetIthTerm(1)->SetNumber(nil);
924 DeleteObject (string_object);
925 DeleteObject (string_name);
926 }
927 } else {
928
929 auto map_site = [] (const _Site* site, _String& buffer, _SimpleList const * mapper) -> void {
930 mapper->Each ([&] (long value, unsigned long index) -> void {
931 buffer.set_char (index, site->char_at (value));
932 });
933 };
934
935 bool *eligibleMarks = nil;
936
937
938
939 if (is_regexp) {
940 eligibleMarks = new bool[lLength];
941 if (additionalFilter) {
942 InitializeArray(eligibleMarks, lLength, false);
943 for (long siteIndex = 0; siteIndex < additionalFilter->lLength; siteIndex ++) {
944 eligibleMarks[theMap.list_data[additionalFilter->list_data[siteIndex]]] = true;
945 }
946 }
947 else {
948 InitializeArray(eligibleMarks, lLength, true);
949 }
950
951 _String *tempString = nil;
952 _SimpleList matches;
953 if (otherDimension) {
954 tempString = new _String (otherDimension->countitems());
955 }
956
957 for (long siteCounter = 0; siteCounter < lLength; siteCounter ++)
958 if (eligibleMarks[siteCounter]) {
959 matches.Clear();
960 if (otherDimension) {
961 map_site ((_Site*)GetItem(siteCounter), *tempString, otherDimension);
962 matches = tempString->RegExpMatch (regex, 0L);
963 } else {
964 matches = ((_Site**)list_data)[siteCounter]->RegExpMatch (regex, 0L);
965 }
966 if (matches.empty()) {
967 eligibleMarks[siteCounter] = false;
968 }
969 }
970
971 DeleteObject (tempString);
972 if (additionalFilter) {
973 for (long afi = 0; afi < additionalFilter->lLength; afi++) {
974 if (eligibleMarks[theMap.list_data[additionalFilter->list_data[afi]]]) {
975 target << afi;
976 }
977 }
978 } else {
979 theMap.Each ([&target, eligibleMarks] (long site_pattern, unsigned long index) -> void {
980 if (eligibleMarks [site_pattern]) {
981 target << index;
982 }
983 });
984 }
985 } else {
986
987
988 long freq_dimension = ComputePower (GetCharDimension(), unit_length);
989
990 if (freq_dimension > 0xffff) {
991 HandleApplicationError("The dimension of the character space is too high for callback filtering");
992 return;
993 }
994 eligibleMarks = new bool[theMap.lLength];
995
996 if (additionalFilter) {
997 InitializeArray(eligibleMarks, theMap.lLength, false);
998 for (long siteIndex = 0; siteIndex < additionalFilter->lLength; siteIndex ++) {
999 eligibleMarks[additionalFilter->list_data[siteIndex]] = true;
1000 }
1001 }
1002 else {
1003 InitializeArray(eligibleMarks, theMap.lLength, true);
1004 }
1005
1006 filter_formula.GetList() < new _Operation()
1007 < new _Operation()
1008 < new _Operation(kEmptyString,-is_hbl_function-1L);
1009
1010 _Matrix * strings = nil,
1011 * frequencies = nil;
1012
1013
1014
1015 _List string_list,
1016 string_storage;
1017
1018
1019 if (otherDimension) {
1020 for (int i = 0; i < unit_length; i++) {
1021 string_storage < new _String (otherDimension->countitems());
1022 }
1023 }
1024
1025 _SimpleList sites (unit_length, 0, 0),
1026 sequences;
1027
1028 if (otherDimension) {
1029 sequences = *otherDimension;
1030 } else {
1031 sequences.Populate((unsigned long)NoOfSpecies(), 0, 1);
1032 }
1033
1034 for (long siteCounter = 0L; siteCounter + unit_length <= theMap.lLength; siteCounter += unit_length) {
1035 long unit_space = 0L;
1036 string_list.Clear();
1037 for (; unit_space < unit_length; unit_space++) {
1038 if (eligibleMarks[siteCounter + unit_space]) {
1039 sites[unit_space] = siteCounter + unit_space;
1040 if (otherDimension) {
1041 map_site ((_Site*)GetSite(siteCounter + unit_space), *(_String*)string_storage.GetItem (unit_space), otherDimension);
1042 } else {
1043 string_list << GetSite (siteCounter + unit_space);
1044 }
1045 } else {
1046 break;
1047 }
1048 }
1049
1050 if (unit_space == unit_length) {
1051 //eligibleMarks[siteCounter + unit_space]
1052 BatchDelete(strings, frequencies);
1053 if (otherDimension) {
1054 strings = new _Matrix (string_storage);
1055 } else {
1056 strings = new _Matrix (string_list);
1057 }
1058
1059 filter_formula.GetIthTerm(0)->SetNumber(strings);
1060
1061 //(unsigned char unit, unsigned char atom, bool posSpec, _SimpleList& hSegmentation, _SimpleList& vSegmentation, bool countGaps)
1062 frequencies = HarvestFrequencies (unit_space, unit_space, false, sequences, sites, false);
1063 filter_formula.GetIthTerm(1)->SetNumber(frequencies);
1064
1065 if (!CheckEqual(0.,filter_formula.Compute()->Value())) {
1066 continue;
1067 }
1068 }
1069
1070 for (unit_space = 0L; unit_space < unit_length; unit_space++) {
1071 eligibleMarks[siteCounter + unit_space] = false;
1072 }
1073 }
1074
1075 theMap.Each ([&target, eligibleMarks] (long site_pattern, unsigned long index) -> void {
1076 if (eligibleMarks [index]) {
1077 target << index;
1078 }
1079 });
1080 // strings && frequencies will be cleaned up by the destructor of filter_formula
1081 }
1082
1083 delete [] eligibleMarks;
1084 }
1085 if (regex) {
1086 _String::FlushRegExp (regex);
1087 }
1088 } else {
1089 input = input.KillSpaces ();
1090 // now process the string
1091 long count = 0L,anchor;
1092
1093 _SimpleList numbers,
1094 links;
1095
1096 numbers.RequestSpace (1024);
1097 links.RequestSpace (1024);
1098
1099 // first check if it is has a comb filter
1100
1101 if ( input (0) =='<' && input (-1) =='>') {
1102 for (count=1; count<input.length()-1; count++) {
1103 if (input.char_at(count) != '0') {
1104 numbers<<count-1;
1105 }
1106 }
1107 if (numbers.countitems()) {
1108 long k = input.length()-2; // step size
1109 anchor = 0;
1110 if (totalLength == -1) {
1111 totalLength = theMap.lLength;
1112 }
1113 while (anchor<totalLength-k) {
1114 for (count = 0; count< numbers.lLength; count++) {
1115 target<<anchor+numbers.list_data[count];
1116 }
1117 anchor+=k;
1118 }
1119 if ( (k=totalLength-1-anchor) ) {
1120 for (count = 0; count< numbers.lLength; count++) {
1121 if (numbers.list_data[count]>k) {
1122 break;
1123 }
1124 target<<anchor+numbers.list_data[count];
1125 }
1126 }
1127 return;
1128
1129 }
1130 }
1131
1132 while (count<input.length()) {
1133 anchor = count;
1134
1135 for (; count<input.length() && isdigit(input.char_at (count)); count++) ;
1136
1137 long aNumber = (input.Cut (anchor,count-1)).to_long();
1138
1139 if (aNumber < 0) {
1140 _String warnMsg ("A negative number was found in partition specification: ");
1141 ReportWarning (warnMsg & input.Cut (0,anchor-1) & '?' & input.Cut (anchor,-1));
1142 target.Clear();
1143 return;
1144 }
1145 numbers<< aNumber;
1146
1147 char current_char = input.char_at (count);
1148
1149 if (current_char == '<' || current_char == '>') {
1150 ReportWarning (_String ("A comb partition cannot be combined with other types. The entire partition is reset to first..last") & input.Cut (0,anchor-1) & '?' & input.Cut (anchor,-1));
1151 target.Clear();
1152 return;
1153 }
1154
1155 if (current_char == '&') {
1156 links << numbers.lLength;
1157 }
1158
1159 // TODO SLKP 20171001 this needs to be checked for correctness
1160 if (current_char == ',' || count == input.length()) { // wrap it up dude
1161 if (numbers.countitems() == 1) {
1162 target<<numbers(0);
1163 } else {
1164 if (links.empty()) {
1165 if (numbers[0]>numbers[1]) { // backward order
1166 for (long k = numbers[0]; k>=numbers[1]; k--) {
1167 target<<k;
1168 }
1169 } else {
1170 for (long k = numbers[0]; k<=numbers[1]; k++) {
1171 target<<k;
1172 }
1173 }
1174 } else {
1175 // linked locations
1176 if (links.countitems() != (numbers.countitems()-2) / 2) {
1177 ReportWarning ("A part of the partition specification has not been understood and is being skipped.");
1178 target.Clear();
1179 return;
1180 } else {
1181 _SimpleList signs;
1182 signs<<(numbers(0)<numbers(1)?1:-1);
1183 for (long k = 0; k<links.lLength; k+=2) {
1184 signs<<(numbers(links(k))<numbers(links(k+1))?1:-1);
1185 }
1186
1187 for (long k=numbers(0), l=0 ; signs(0)*k<=signs(0)*numbers(1); k+=signs(0), l++) {
1188 target<<numbers(0)+l*signs(0);
1189 for (long m=0; m<links.lLength; m++) {
1190 target<<numbers(links(m))+l*signs(m+1);
1191 }
1192 }
1193 }
1194 }
1195 }
1196 numbers.Clear();
1197 links.Clear();
1198 }
1199 count++;
1200 }
1201 }
1202 }
1203 }
1204
1205 //_________________________________________________________
1206
_DataSet *_DataSet::Concatenate(_SimpleList const &ref)

// Concatenates (adds columns together) the datasets whose indices into the
// global dataSetList are given in 'ref'. If the datasets have different
// numbers of species, the deficiencies are padded with omission (skip)
// symbols. Translation tables are merged when possible (CheckCompatibility);
// if no joint table can be built, an empty dataset is returned (the
// incompatible datasets are effectively ignored).

{
  _TranslationTable *jointTable = CheckCompatibility(ref, 1);
  if (!jointTable) {
    // incompatible alphabets: nothing to concatenate
    return new _DataSet;
  }

  _DataSet *bigDataSet = new _DataSet;

  bigDataSet->theTT = jointTable;

  // pass one - determine the max number of species present and which dataset
  // it comes from; at the same time lay down row 0 via AddSite, which
  // creates the column (site) objects

  long maxSpecies = 0, maxDataSet = 0, siteIndex;

  _DataSet *currentSet;

  // character used to pad rows missing from datasets with fewer species
  char emptyStringSlot = jointTable->GetSkipChar();

  for (long i = 0; i < ref.lLength; i++) {
    currentSet = (_DataSet *)dataSetList(ref(i));

    long specCount = currentSet->NoOfSpecies(),
         siteCount = currentSet->NoOfColumns();

    if (specCount > maxSpecies) {
      maxSpecies = specCount;
      maxDataSet = i;
    }
    // append every column of this dataset (row 0) as a new site
    for (long j = 0; j < siteCount; j++) {
      bigDataSet->AddSite((*currentSet)(j, 0, 1));
    }
  }

  // pass two - write rows 1..maxSpecies-1 with Write2Site; datasets that
  // lack sequence k contribute skip characters over their column range
  for (long k = 1; k < maxSpecies; k++) {
    siteIndex = 0;
    for (long i = 0; i < ref.lLength; i++) {
      currentSet = (_DataSet *)dataSetList(ref.list_data[i]);

      long cns = currentSet->NoOfSpecies(), cnc = currentSet->NoOfColumns();

      if (cns <= k)
        // no sequence k in this dataset: pad
        for (long j = 0; j < cnc; j++, siteIndex++) {
          bigDataSet->Write2Site(siteIndex, emptyStringSlot);
        }
      else
        for (long j = 0; j < cnc; j++, siteIndex++) {
          bigDataSet->Write2Site(siteIndex, (*currentSet)(j, k, 1));
        }
    }
  }

  // sequence names are copied from the dataset with the most species
  currentSet = (_DataSet *)dataSetList(ref(maxDataSet));
  for (long i = 0L; i < maxSpecies; i++) {
    bigDataSet->AddName(*currentSet->GetSequenceName(i));
  }

  bigDataSet->Finalize();
  bigDataSet->SetNoSpecies(maxSpecies);
  return bigDataSet;
}
1276
1277 //_________________________________________________________
1278
_DataSet *_DataSet::Combine(_SimpleList const &ref) {

  // Combines (adds rows/sequences together) the datasets whose indices into
  // the global dataSetList are given in 'ref'. If the datasets have
  // different numbers of sites, shorter sequences are padded with omission
  // (skip) symbols up to the longest dataset. Translation tables are merged
  // when possible (CheckCompatibility); if no joint table can be built, an
  // empty dataset is returned (incompatible datasets are effectively
  // ignored).

  _TranslationTable *joint_table = CheckCompatibility(ref, 0);

  if (!joint_table) {
    return new _DataSet;
  }

  _DataSet *combined_data = new _DataSet;
  combined_data->theTT = joint_table;

  // pass one - determine the maximum number of sites in any dataset and the
  // total number of sequences across all datasets

  unsigned long max_sites = 0UL, total_species_count = 0UL;

  // character used to pad sequences from datasets with fewer sites
  char emptyStringSlot = joint_table->GetSkipChar();

  for (unsigned long set_index = 0UL; set_index < ref.lLength; set_index++) {
    _DataSet const *current_data_set =
        (_DataSet const *)dataSetList(ref.Element(set_index));
    StoreIfGreater(max_sites, current_data_set->NoOfColumns());
    total_species_count += current_data_set->NoOfSpecies();
  }

  // pass two - copy each sequence over, padding it to max_sites
  for (unsigned long set_index = 0UL; set_index < ref.lLength; set_index++) {
    _DataSet const *current_data_set =
        (_DataSet const *)dataSetList(ref.Element(set_index));
    unsigned long sites_in_this_set = current_data_set->NoOfColumns(),
                  sequences_in_this_set = current_data_set->NoOfSpecies();

    for (unsigned long seq_index = 0UL; seq_index < sequences_in_this_set;
         seq_index++) {
      combined_data->AddName(*current_data_set->GetSequenceName(seq_index));
      if (seq_index == 0UL && set_index == 0UL) {
        /* use AddSite to write out the first sequence; this creates the
           column (site) objects */
        unsigned long site_index = 0UL;
        for (site_index = 0UL; site_index < sites_in_this_set; site_index++) {
          combined_data->AddSite((*current_data_set)(site_index, 0UL, 1));
        }
        for (; site_index < max_sites; site_index++) {
          combined_data->AddSite(emptyStringSlot);
        }
      } else {
        /* use Write2Site to append subsequent sequences to existing columns */
        unsigned long site_index = 0UL;
        for (site_index = 0UL; site_index < sites_in_this_set; site_index++) {
          combined_data->Write2Site(
              site_index, (*current_data_set)(site_index, seq_index, 1));
        }
        for (; site_index < max_sites; site_index++) {
          combined_data->Write2Site(site_index, emptyStringSlot);
        }
      }
    }
  }

  combined_data->Finalize();
  combined_data->SetNoSpecies(total_species_count);
  return combined_data;
}
1346
1347 //_______________________________________________________________________
1348
GetSequenceCharacters(long seqID) const1349 _String* _DataSet::GetSequenceCharacters (long seqID) const{
1350
1351 unsigned long upTo = NoOfColumns();
1352 _StringBuffer * aSequence = new _StringBuffer (upTo);
1353
1354 if (seqID >= 0L && seqID < noOfSpecies) {
1355 for (unsigned long k2=0UL; k2<upTo; k2++) {
1356 (*aSequence) << GetSite (k2)->char_at(seqID);
1357 }
1358 }
1359 aSequence->TrimSpace ();
1360 return aSequence;
1361 }
1362
1363 //_________________________________________________________
1364
bool StoreADataSet (_DataSet* ds, _String* setName) {
    // Registers 'ds' in the global dataset list under 'setName'. If a dataset
    // with that name already exists it is replaced; dependent data filters
    // are either re-pointed at the new dataset (same dimensions and
    // compatible translation tables) or deleted with a warning. Optionally
    // normalizes sequence names into valid unique identifiers, and publishes
    // the <setName>.species / .sites / .unique_sites / .mapping variables.
    // Returns false only when 'setName' is not a valid identifier.
    if (!setName->IsValidIdentifier (fIDAllowCompound | fIDAllowFirstNumeric)) {
        HandleApplicationError (setName->Enquote() & " is not a valid identifier while constructing a DataSet");
        return false;
    }

    long type = HY_BL_DATASET, index;
    _DataSet * existing_ds = (_DataSet * )_HYRetrieveBLObjectByNameMutable (*setName, type, &index, false, false);


    if (! existing_ds) {
        // new name: append to the global registries
        dataSetNamesList << setName;
        dataSetList < ds;
    } else {

        // overwriting: filters attached to the old dataset survive only if
        // the replacement "looks the same" (dimensions and translation table)
        bool isDifferent = existing_ds->NoOfSpecies () != ds->NoOfSpecies() ||
        existing_ds->NoOfColumns () != ds->NoOfColumns() ||
        existing_ds->NoOfUniqueColumns () != ds->NoOfUniqueColumns() ||
        (existing_ds->GetTT () != ds->GetTT() && !(*existing_ds->GetTT () == *ds->GetTT()));

        for (AVLListXLIteratorKeyValue filter_key_value : ObjectIndexer (HY_BL_DATASET_FILTER)) {
            _DataSetFilter * filter = (_DataSetFilter*) filter_key_value.get_object();
            if (filter->GetData() == existing_ds) {
                if (isDifferent) {
                    ReportWarning (_String("Overwriting dataset '") & *setName & "' caused DataSetFilter " & GetFilterName(filter_key_value.get_index())->Enquote('\'') & " to be deleted");
                    DeleteDataFilter(filter_key_value.get_index());
                } else {
                    // same shape: retarget the filter at the new dataset
                    filter->SetData(ds);
                }
            }
        }

        dataSetList.Replace(index,ds,false);
    }

    // reset <setName>.mapping; replaced below if any names were changed
    CheckReceptacleAndStore (*setName&".mapping",kEmptyString,false, new _MathObject, false);

    if (hy_env::EnvVariableTrue(hy_env::normalize_sequence_names)) {
        // make each sequence name a valid, unique identifier, remembering the
        // original names (new name -> old name) so a mapping can be published
        _List _id_mapping;
        _AVLListXL id_mapping (&_id_mapping);
        bool did_something = false;

        for (unsigned long i = 0UL; i < ds->NoOfSpecies(); i ++) {
            _String * old_name = new _String (*ds->GetSequenceName (i));
            if (! old_name->IsValidIdentifier(fIDAllowFirstNumeric) ) {
                *ds->GetSequenceName (i) = ds->GetSequenceName (i)->ConvertToAnIdent(fIDAllowFirstNumeric);
                did_something = true;
            }
            if (id_mapping.Find (ds->GetSequenceName (i)) >= 0) {
                // duplicate after normalization: append _1, _2, ... until unique
                _String new_name (*ds->GetSequenceName (i));
                long suffix = 1L;
                do {
                    new_name = *ds->GetSequenceName (i) & "_" & suffix++;
                } while (id_mapping.Find (&new_name) >= 0);
                *ds->GetSequenceName (i) = new_name;
                did_something = true;
            }

            // key: (possibly renamed) sequence name; payload: original name
            ds->GetSequenceName (i)->AddAReference();
            id_mapping.Insert (ds->GetSequenceName (i), (long)old_name, false, false);
        }

        if (did_something) {
            // publish the new-name -> old-name map as <setName>.mapping
            _AssociativeList * mapping = new _AssociativeList();

            for (AVLListXLIteratorKeyValue filter_key_value : AVLListXLIterator (&id_mapping)) {
                //printf ("%d => %s\n", filter_key_value.get_index(), ((_String*)filter_key_value.get_object())->get_str());
                mapping->MStore(*(_String *)id_mapping.Retrieve (filter_key_value.get_index()), *(_String*)filter_key_value.get_object());
            }

            CheckReceptacleAndStore (*setName&".mapping",kEmptyString,false, mapping, false);
        }
    }

    CheckReceptacleAndStore (*setName&".species",kEmptyString,false, new _Constant (ds->NoOfSpecies()), false);
    CheckReceptacleAndStore (*setName&".sites",kEmptyString,false, new _Constant (ds->NoOfColumns()), false);
    CheckReceptacleAndStore (*setName&".unique_sites",kEmptyString,false, new _Constant (ds->NoOfUniqueColumns()), false);

    return true;
}
1446
1447 //_________________________________________________________
1448 //_________________________________________________________
1449 // reading the data set file in here
1450
1451 //_________________________________________________________
checkTTStatus(FileState * fs)1452 void checkTTStatus (FileState* fs) {// check whether the translation table needs to be refreshed}
1453 if (fs->translationTable == &hy_default_translation_table) {
1454 fs->translationTable = (_TranslationTable*)hy_default_translation_table.makeDynamic();
1455 }
1456 }
1457 //_________________________________________________________
void processCommand (_String * s, FileState*fs) {

    // Parses a '$' control line of the form COMMAND : DATA and updates the
    // file state / translation table accordingly. Recognized commands are
    // BASESET, FORMAT, RAWLINE, REPEAT and TOKEN. Malformed commands are
    // reported via ReportWarning; unrecognized lines are silently ignored.

    static const _List _CommandList (
        new _String ("BASESET"),
        new _String ("FORMAT"),
        new _String ("RAWLINE"),
        new _String ("REPEAT"),
        new _String ("TOKEN")
    );

    static const _String kBase20 ("BASE20"),
                         kPHYLIPi ("PHYLIPI"),
                         kPHYLIPs ("PHYLIPS"),
                         kRAW ("RAW");


    long f = -1,
         command_index;

    // identify which command (if any) appears on this line
    for (command_index=0L; command_index<_CommandList.countitems(); ++command_index) {
        f = s->Find (*(_String*)_CommandList.GetItem (command_index));
        if ( f!= kNotFound) {
            break;
        }
    }

    try {
        if (f==-1) { // unrecognized command
            return;
        } else {
            // trim the string
            //s->Trim (f+((_String*)CommandList(i))->Length(),-1);

            // locate the ':' separating the command keyword from its payload
            f = s->Find (":", f + 1L + ((_String*)_CommandList.GetItem (command_index))->length());

            if (f == kNotFound) { // poorly formed command
                throw (s->Enquote('[', ']') & " was not of the form COMMAND : DATA");
            }


            if (command_index >= 1 && command_index<=3 ) {
                // FORMAT / RAWLINE / REPEAT: payload must be double-quoted
                long start = f + 1;
                long end = s->ExtractEnclosedExpression(start, '"', '"', 0);

                if (end == kNotFound || end - start <= 2L) {
                    throw (s->Enquote('[', ']') & " was not of the form COMMAND : \"DATA\" (missing quotes)");
                }
                s->Trim (start + 1L, end - 1L);
            } else {
                // BASESET / TOKEN parse their own quoting below
                s->Trim (f + 1L, kStringEnd);
            }

            // 's' should now contain only the payload of the command

            switch (command_index) {
                char c;

                case 4: {// TOKEN: define a new character token
                    checkTTStatus (fs);
                    // attempt to extract a token. Looking for (e.g): "c" = "AC"

                    _SimpleList matches = s->RegExpMatch("\\\"([a-z,A-Z])\\\"\\ *=\\ *\\\"([a-z,A-Z]+)\\\"", false, true);
                    if (matches.countitems() == 6) {
                        // matches holds whole-match + 2 capture-group ranges
                        fs->translationTable->AddTokenCode (matches.get (2),s->Cut (matches.get(3), matches.get(4)));
                    } else {
                        throw (s->Enquote('[', ']') & " was not of the form \"token\"=\"translation\"");
                    }
                }
                break;


                case 0: { // BASESET: new code set, e.g "ACGU"
                    checkTTStatus(fs);
                    // erase previous char definitions
                    fs->translationTable->translationsAdded.Clear();
                    fs->translationTable->tokensAdded = "";
                    if (*s != kBase20) {
                        long start = 0;
                        long end = s->ExtractEnclosedExpression(start, '"', '"', 0);
                        if (end == kNotFound || end - start <= 3L) {
                            throw (s->Enquote('[', ']') & " was not of the form \"at least two letters\"");
                        }
                        // TODO : there is no check that baseset is actually valid (e.g. no duplicate characters etc)
                        fs->translationTable->AddBaseSet (s->Cut (start + 1, end - 1));
                    } else {
                        // BASE20: the 20-character amino-acid alphabet
                        fs->translationTable->AddBaseSet (kEmptyString);
                        fs->translationTable->baseLength = 20;
                    }
                }
                break;

                case 1: //FORMAT
                    if (*s== kPHYLIPi) { // PHYLIP Interleaved
                        fs->fileType = 1;
                        fs->interleaved = TRUE;
                    } else if (*s== kPHYLIPs) { // PHYLIP sequential
                        fs->fileType = 1;
                        fs->interleaved = FALSE;
                    }
                    if (*s== kRAW) { // RAW Sequential Data (as in NEXUS)
                        fs->fileType = 2;
                        fs->interleaved = FALSE;
                    }
                    // an explicit FORMAT command disables format auto-detection
                    fs->autoDetect = false;
                    break;

                case 3: // REPEAT CHAR (e.g. '.' meaning "same as first sequence")
                    fs->repeat = s->get_char(0);
                    break;

                case 2: // RAWLINE template e.g 1,-1 skips one word at the beginning and one word at the end
                    _List chips (s,',');
                    chips.ForEach([&] (BaseRef number, unsigned long index) -> void {
                        fs->rawLinesFormat<< ((_String*)number)->to_long();
                    });
                    break;

            }
        }
    } catch (const _String& warning) {
        ReportWarning (warning);
    }
}
1581 //_________________________________________________________
1582
void FilterRawString (_String& s, FileState* fs, _DataSet & ds) {
    // TODO: SLKP 20180803 this needs to be tested or deprecated.
    // Applies the RAWLINE template (fs->rawLinesFormat) to one raw data line:
    // a positive template entry skips that many words from the front, a
    // negative entry drops |entry| words from the end, and a zero consumes
    // one word as a sequence name (added to 'ds'). On exit 's' holds the
    // remaining words joined by single spaces, or the empty string if
    // nothing is left.
    s.CompressSpaces();
    _List words (s.Tokenize(" "));

    long current_start = 0L,
         current_end = (long)words.countitems () - 1L;

    fs->rawLinesFormat.Each([&] (long word, unsigned long idx) -> void {
        if (word > 0L) {
            current_start += word;   // skip 'word' tokens from the left
        } else {
            if (word < 0L) {
                current_end += word; // trim |word| tokens from the right
            } else {
                // 0: treat the next token as a sequence name
                if (current_start < current_end) {
                    ds.AddName (*((_String*)words.GetItem (current_start)));
                    current_start ++;
                }
            }
        }
    });

    if (current_start >= current_end) {
        s = kEmptyString;
    } else {
        s = words.Join(" ", current_start, current_end);
    }
}
1612 //_________________________________________________________________________________________________
1613
1614
1615 //_________________________________________________________________________________________________
1616
ProcessTree(FileState * fState,hyFile * f,_StringBuffer & CurrentLine)1617 void ProcessTree (FileState *fState, hyFile * f, _StringBuffer& CurrentLine) {
1618
1619 // TODO SLKP 20180921 this does extra work to read in the tree string multiple times;
1620 // the solution is to have a proper buffer wrapper, and to
1621
1622 class _MultilineBuffer : public _StringBuffer {
1623 public:
1624
1625 _MultilineBuffer (_String const& current_line, FileState *fs, hyFile* f) : _StringBuffer (current_line) {
1626 file_state = fs;
1627 file = f;
1628 }
1629
1630 virtual char get_char(long index) {
1631 if (index >= 0L && index < s_length) {
1632 return s_data[index];
1633 } else {
1634 _StringBuffer next_line;
1635 ReadNextLine (file,&next_line,file_state, false);
1636 if (next_line.nonempty()) {
1637 *this << next_line;
1638 return get_char (index);
1639 }
1640 }
1641 return _String::default_return;
1642 }
1643
1644 FileState *file_state;
1645 hyFile * file;
1646
1647 };
1648
1649
1650 //_MultilineBuffer mlb (CurrentLine, fState, f);
1651
1652 _StringBuffer * tree_string = new _StringBuffer (128L);
1653 long start_index = 0,
1654 end_index = CurrentLine.ExtractEnclosedExpression(start_index, '(', ')', fExtractRespectQuote|fExtractRespectEscape);
1655
1656 while (start_index == kNotFound || end_index == kNotFound) {
1657 _StringBuffer next_line;
1658 ReadNextLine (f,&next_line,fState, false);
1659 CurrentLine = CurrentLine & next_line;
1660 start_index = 0L;
1661 end_index = CurrentLine.ExtractEnclosedExpression(start_index, '(', ')', fExtractRespectQuote|fExtractRespectEscape);
1662 }
1663
1664 if (start_index == kNotFound || end_index == kNotFound) {
1665 ReportWarning (tree_string->Enquote() & " has mimatched '(' and ')'");
1666 DeleteObject (tree_string);
1667 } else {
1668 *tree_string << CurrentLine.Cut (start_index, end_index);
1669 tree_string->TrimSpace();
1670 CurrentLine.Trim (end_index + 1, kStringEnd);
1671 hy_env::EnvVariableSetNamespace(hy_env::data_file_tree, new HY_CONSTANT_TRUE, fState->theNamespace, false);
1672 hy_env::EnvVariableSetNamespace(hy_env::data_file_tree_string, new _FString (tree_string), nil, false);
1673 }
1674 }
1675
1676 //_________________________________________________________
1677
long ProcessLine (_String&s , FileState *fs, _DataSet& ds) {
    // Appends the characters of one sequence line to the dataset.
    // For the first species (fs->curSpecies == 0) every legal character adds
    // a new site (column); for later species, characters are written into
    // existing columns. The 'repeat' character (usually '.') is resolved to
    // the character of the reference (first) sequence at that column.
    // Returns the number of sites consumed from this line.
    long sitesAttached = 0L;

    try {
        s.Each([&] (char letter, unsigned long i) -> void {
            letter = toupper(letter);
            if (fs->translationTable->IsCharLegal(letter)) { // go on
                if (fs->curSpecies==0) { // add new column
                    ds.AddSite (letter);
                    sitesAttached++;
                } else { //append to existing column
                    //if (c == fs->skip) continue;
                    // check to see if this species needs to be padded

                    if (letter == fs->repeat) {
                        if ( fs->curSite+sitesAttached >= ds.lLength) { // a dot not matched by a previously read character; ignore
                            // abort the scan; the site count accumulated so
                            // far is delivered through the catch below
                            throw sitesAttached;
                        }

                        letter = ((_Site*)(ds._List::operator () (fs->curSite+sitesAttached)))->get_char(0);
                        if (letter=='\0') {
                            // compacted site: follow the reference index to
                            // the master copy of this column
                            letter = ((_Site*)(ds._List::operator ()
                                               (((_Site*)(ds._List::operator () (fs->curSite+sitesAttached)))->GetRefNo())))->get_char(0);
                        }
                    }

                    if (fs->curSite+sitesAttached+1>fs->totalSitesRead) {
                        // this line extends past all sites seen so far:
                        // create a new column and
                        // pad previous species to full length
                        _Site * newS = new _Site (fs->skip);
                        newS->AppendNCopies(fs->skip, fs->curSpecies-1L);
                        (*newS) << letter;


                        ds.theFrequencies << 1;
                        newS->SetRefNo(-1);

                        ds < newS;
                        fs->totalSitesRead++;
                    } else {
                        ds.Write2Site (fs->curSite+sitesAttached, letter);
                    }

                    sitesAttached++;
                }
            }
        });
    } catch (long e) {
        // early exit triggered by an unmatched repeat character
        return e;
    }

    // make sure that this species has enough data in it, and if not - pad it with '?'

    if ( fs->curSite+sitesAttached<fs->totalSitesRead && fs->interleaved) {
        // pad this species to full length
        for (long j = fs->curSite+sitesAttached; j<fs->totalSitesRead; j++) {
            ds.Write2Site (j, fs->skip);
        }
    }
    if (!fs->curSpecies) {
        // first sequence defines the running total of sites
        fs->totalSitesRead+=sitesAttached;
    }
    return sitesAttached;
}
1741
1742 //_________________________________________________________
PadLine(FileState & fState,_DataSet & result)1743 void PadLine (FileState& fState, _DataSet& result) { // make sure that there is enough data in this line
1744 // and if not - "pad" it with '?''s
1745 if (fState.curSite<fState.totalSitesRead) // pad line if needed
1746 for (long j = fState.curSite; j<fState.totalSitesRead; j++) {
1747 result.Write2Site (j, fState.skip);
1748 }
1749 }
1750
1751 //_________________________________________________________
void ISelector (FileState& fState, _StringBuffer& CurrentLine, _DataSet& result) {
    // Dispatches one data line to ProcessLine, maintaining the per-species /
    // per-site bookkeeping for both interleaved and sequential layouts.
    if (fState.interleaved) { // interleaved file
        if (fState.curSpecies&&(!((fState.curSpecies)%fState.totalSpeciesExpected))) { // read a chunk of all species
            // a full block of species has been read; compact the freshly
            // completed columns before starting the next block
            if (fState.totalSitesRead && !result.InternalStorageMode()) {
                for (long i = fState.curSite; i<fState.totalSitesRead; i++) {
                    result.Compact(i);
                }

                result.ResetIHelper();

            }
            fState.curSite = fState.totalSitesRead;
            fState.curSpecies = 0;
            // curSpecies == 0 makes ProcessLine create new columns
            ProcessLine (CurrentLine, &fState, result);
            fState.curSpecies = 1;
            if (!fState.curSite) {
                fState.totalSpeciesRead++;
            }
        } else {
            ProcessLine (CurrentLine, &fState, result);
            if (!fState.curSite) {
                fState.totalSpeciesRead++;
            }
            fState.curSpecies++;
        }
    } else {
        // sequential layout: each species occupies consecutive lines
        if (fState.curSpecies+1<fState.totalSpeciesExpected) {
            fState.curSpecies++;
        }
        if (fState.curSpecies == fState.totalSpeciesRead) {
            // starting a new species; make sure the previous one was padded
            PadLine (fState, result);
            fState.curSite = 0;
        }
        if (fState.totalSpeciesRead<fState.totalSpeciesExpected) {
            fState.totalSpeciesRead++;
        }

        fState.curSite+=ProcessLine (CurrentLine, &fState, result);

    }
}
1793
1794 //_________________________________________________________
SkipLine(_StringBuffer & theLine,FileState * fS)1795 bool SkipLine (_StringBuffer& theLine, FileState* fS) {
1796
1797 if ( theLine.char_at(0) =='/' && theLine.char_at(1)=='/' ) {
1798 return true;
1799 }
1800
1801 char c = theLine.FirstNonSpace();
1802
1803 if (c&& (!( c=='$' && !fS->acceptingCommands)) ) {
1804 return false;
1805 }
1806
1807 return true;
1808 }
1809
1810
1811 //_________________________________________________________
void ReadNextLine (hyFile * fp, _StringBuffer *s, FileState* fs, bool, bool upCase) {
    // Reads the next logical line into 's' from either the file 'fp' or,
    // when 'fp' is null, the in-memory source fs->theSource (advancing
    // fs->pInSrc). Lines rejected by SkipLine (comments, blanks, stale
    // commands) are skipped by recursing. For NEXUS input
    // (fs->fileType == 3) bracketed [...] comments are stripped — possibly
    // across lines via fs->isSkippingInNEXUS — and characters are
    // upper-cased when 'upCase' is set. The unnamed bool parameter is
    // unused; it is kept for interface compatibility.
    _StringBuffer tempBuffer (1024L);

    fs->currentFileLine ++;

    char lastc;

    // grab the first character from whichever source is active
    if (fp) {
        lastc = fp->getc();
    } else {
        lastc = fs->pInSrc<fs->theSource->length()?fs->theSource->char_at(fs->pInSrc++):0;
    }


    if (fs->fileType != 3) { // not NEXUS - do not skip [..]
        // plain mode: copy characters until CR/LF or end of input
        if (fp)
            while ( !fp->feof() && lastc!=10 && lastc!=13 ) {
                if (lastc) {
                    tempBuffer << lastc;
                }

                lastc = fp->getc();
            }
        else
            while (lastc && lastc!=10 && lastc!=13 ) {
                tempBuffer << lastc;
                lastc = fs->theSource->char_at(fs->pInSrc++);
            }

    } else {
        // NEXUS mode: honor bracketed comments, which may span lines
        if (upCase) {
            lastc = toupper(lastc);
        }

        while (((fp&&!fp->feof())||(fs->theSource&&(fs->pInSrc<=fs->theSource->length ()))) && lastc!='\r' && lastc!='\n') {
            if (lastc=='[') {
                if (fs->isSkippingInNEXUS) {
                    ReportWarning ("Nested comments in NEXUS really shouldn't be used.");
                } else {
                    fs->isSkippingInNEXUS = true;
                }
            }
            if (fs->isSkippingInNEXUS) {
                // inside a [...] comment: drop characters until ']'
                if (lastc==']') {
                    fs->isSkippingInNEXUS = false;
                    tempBuffer << ' ';
                }
            } else {
                tempBuffer << lastc;
            }

            if (fp) {
                if (upCase) {
                    lastc = toupper(fp->getc());
                } else {
                    lastc = fp->getc();
                }
            } else {
                if (upCase) {
                    lastc = toupper(fs->theSource->char_at(fs->pInSrc++));
                } else {
                    lastc = fs->theSource->char_at(fs->pInSrc++);
                }
            }

        }

        if ( lastc==10 || lastc==13 ) {
            // preserve a separator where the line break was
            tempBuffer << ' ';
        }
    }

    tempBuffer.TrimSpace();

    if ( (fp && fp->feof ()) || (fs->theSource && fs->pInSrc >= fs->theSource->length()) ) {
        if (tempBuffer.empty ()) {
            // end of input with nothing read: report an empty line
            *s = "";
            return;
        }
    }
    *s = tempBuffer;

    if (SkipLine (*s, fs)) {
        // non-data line: recurse to fetch the next one
        ReadNextLine(fp,s,fs,false,upCase);
    }

    if (s->nonempty() && s->char_at (s->length()-1) == '\n') {
        s->Trim (0,(long)s->length()-2L);
    }
}
1902 //_________________________________________________________
void TrimPhylipLine (_String& CurrentLine, _DataSet& ds) {
    // Splits the sequence name off the front of a PHYLIP data line: the name
    // is added to 'ds' and removed from 'CurrentLine', leaving only sequence
    // characters.
    int fNS = CurrentLine.FirstNonSpaceIndex(),
        space2 = CurrentLine.FirstSpaceIndex (fNS + 1);

    // hack for PAML support: if the first whitespace run is two or more
    // characters wide, treat everything before it as the name (PAML allows
    // names longer than the strict PHYLIP field)
    if (space2 > fNS && isspace(CurrentLine.char_at (space2+1))) {
        _String sequence_name (CurrentLine,fNS, space2);
        CurrentLine.Trim(space2+2,-1); // chop out the name
        ds.AddName(sequence_name);
    } else {
        // classic PHYLIP: the name occupies a fixed-width 10-character field
        // (offsets fNS..fNS+9) — assumes strict-format input; TODO confirm
        _String sequence_name (CurrentLine,fNS, fNS+9);
        CurrentLine.Trim(fNS+10,-1); // chop out the name
        ds.AddName(sequence_name);
    }
}
1918
1919
1920 //_________________________________________________________
ReadDataSetFile(hyFile * f,char execBF,_String * theS,_String * bfName,_String * namespaceID,_TranslationTable * dT,_ExecutionList * ex)1921 _DataSet* ReadDataSetFile (hyFile *f, char execBF, _String* theS, _String* bfName, _String* namespaceID, _TranslationTable* dT, _ExecutionList* ex) {
1922
1923 static const _String kNEXUS ("#NEXUS"),
1924 kDefSeqNamePrefix ("Species");
1925
1926 bool doAlphaConsistencyCheck = true;
1927 _DataSet* result = new _DataSet;
1928
1929 try {
1930
1931 //if (f) flockfile (f);
1932 if (f) f->lock();
1933
1934 hy_env::EnvVariableSet(hy_env::data_file_tree_string, new _Matrix, false);
1935
1936 _String savedLine;
1937
1938 /*_String CurrentLine = hy_env::data_file_tree_string & "={{}};",
1939 savedLine;
1940
1941 _ExecutionList reset (CurrentLine);
1942 reset.Execute();*/
1943
1944 #ifdef __HYPHYMPI__
1945 if (hy_mpi_node_rank == 0L)
1946 #endif
1947 terminate_execution = false;
1948
1949 hy_env::EnvVariableSet(hy_env::data_file_tree, new HY_CONSTANT_FALSE, false);
1950
1951 // initialize the instance of a file state variable
1952 FileState fState;
1953 fState.translationTable = dT;
1954 fState.curSpecies =
1955 fState.totalSpeciesRead =
1956 fState.totalSitesRead =
1957 fState.totalSpeciesExpected =
1958 fState.totalSitesExpected =
1959 fState.curSite =
1960 fState.currentFileLine =
1961 fState.maxStringLength = 0;
1962 fState.acceptingCommands = true;
1963 fState.allSpeciesDefined = false;
1964 fState.interleaved = false;
1965 fState.isSkippingInNEXUS = false;
1966 fState.autoDetect = true;
1967 fState.fileType = -1;
1968 fState.baseLength = 4;
1969 fState.repeat = '.',
1970 fState.skip = 0;
1971 fState.theSource = theS;
1972 fState.pInSrc = 0;
// NOTE(review): this is the tail of _DataSet* ReadDataSetFile(...). The function
// signature, the declarations of `result`, `fState`, `savedLine`, `nexusBFBody`,
// `ex`, `doAlphaConsistencyCheck`, and the opening of the `try` block matched by
// the `catch` below all appear ABOVE this chunk -- confirm against the full file.
// Overall flow visible here: read the first line, branch on format (NEXUS vs.
// line-oriented PHYLIP / raw / '#'-name formats), loop line-by-line, then
// finalize the data set and optionally run an embedded NEXUS batch file.
1973 fState.theNamespace = namespaceID;
1974
// Exactly one of the two input channels (file handle or in-memory string)
// must be supplied; bail out early otherwise.
1975 if (! f && !theS) {
1976 throw _String ("ReadDataSetFile received null file AND string references. At least one must be specified");
1977 }
1978 // done initializing
1979
1980 if (f) {
1981 f->rewind ();
1982 }
1983
1984 _StringBuffer CurrentLine;
1985
1986 //if (f==NULL) return (_DataSet*)result.makeDynamic();
1987 // nothing to do
1988
1989 //CurrentLine = kEmptyString;
1990
// Prime the reader with the first line; an empty first read means an empty input.
1991 ReadNextLine (f,&CurrentLine,&fState);
1992 if (CurrentLine.empty()) {
1993 throw _String ("Empty File Encountered By ReadDataSet.");
1994 } else {
// NEXUS files are handled wholesale by a dedicated reader; the alphabet
// consistency re-check below is skipped for them.
1995 if (CurrentLine.BeginsWith (kNEXUS,false)) {
1996 ReadNexusFile (fState,f,(*result));
1997 doAlphaConsistencyCheck = false;
1998 } else {
// saveSpecExpected starts at LONG "infinity" (0x7FFFFFFF) and is replaced by
// the species count parsed from a PHYLIP header, if one is found.
1999 long i,j,k, filePosition = -1, saveSpecExpected = 0x7FFFFFFF;
2000 char c;
// Outer loop: one iteration per input line, until EOF (empty line buffer).
2001 while (CurrentLine.nonempty()) { // stuff to do
2002 // check if the line has a command in it
2003
2004 c = CurrentLine.FirstNonSpace();
// Inner while(1) is a dispatch-once construct: each `break` falls through to
// ReadNextLine at the bottom of the outer loop; `continue` re-dispatches the
// (possibly restored) current line without reading a new one.
2005 while (1) {
// '$'-prefixed command lines are only honored before any data has been seen.
2006 if (fState.acceptingCommands) {
2007 if (c == '$') { // command line
2008 processCommand(&CurrentLine, &fState);
2009 break;
2010 }
2011 }
2012
2013 if (!fState.skip) {
2014 fState.skip = fState.translationTable->GetSkipChar();
2015 }
2016 fState.acceptingCommands = FALSE;
2017
// fileType sentinel values used in this chunk: -1 undecided, 0 '#'/'>' named
// format, 1 PHYLIP, 2 raw data (fileType==2 is presumably set by
// processCommand above -- confirm upstream).
2018 if (fState.fileType==-1) { // undecided file type - assume it is PHYLIP sequential
2019 if ((c == '#')||(c=='>')) { // hash-mark format
2020 fState.fileType = 0;
2021 } else { // assume this is a sequential PHYLIP file
2022 fState.fileType = 1;
2023 fState.interleaved = false;
2024 }
2025
2026 }
2027 // decide what to do next
2028 // if format is PHYLIP and we do not know the expected dimensions,
2029 // we must read those in first
2030 if (fState.fileType==1) { // PHYLIP
// Remember the stream position (or in-string cursor) and the current line so
// the reader can roll back if the sequential-vs-interleaved guess proves wrong.
2031 if ((filePosition<0)&&(fState.autoDetect)) {
2032 filePosition = (f?
2033 f->tell ()
2034 #ifdef __WINDOZE__
2035 -1
2036 #endif
2037 :fState.pInSrc);
2038 savedLine = CurrentLine;
2039 }
2040
// PHYLIP header line: "<species> <sites> [I]"; a trailing 'I' flags interleaved.
2041 if (fState.totalSitesExpected==0 || fState.totalSpeciesExpected==0) { // must read dimensions first
2042 i = CurrentLine.FirstNonSpaceIndex();
2043 j = CurrentLine.FirstSpaceIndex(i);
2044 if (j != kNotFound) {
2045 k = CurrentLine.FirstNonSpaceIndex(j);
2046 if (k != kNotFound) { // could have dimensions
2047 saveSpecExpected = fState.totalSpeciesExpected = CurrentLine.Cut(i,j-1L).to_long();
2048 fState.totalSitesExpected=CurrentLine.Cut(k, kStringEnd).to_long();
2049 }
2050 if (CurrentLine.Find ('I', k, kStringDirectionBackward)>=0) { // interleaved
2051 fState.interleaved = true;
2052 }
2053 }
2054 } else
2055 // now for the data crunching part
2056 // detect a line, diagnose it and dispatch accordingly
2057 {
2058 if (fState.interleaved) {
// In interleaved mode, sequence names are stripped only while the first block
// of species is still being read.
2059 if (fState.totalSpeciesRead<fState.totalSpeciesExpected) {
2060 TrimPhylipLine (CurrentLine, (*result));
2061 }
2062 if (fState.curSite && fState.curSpecies >= saveSpecExpected &&
2063 fState.totalSitesRead >= fState.totalSitesExpected) {
2064 // reached the end of the data - see maybe there is a tree
2065 ReadNextLine (f,&CurrentLine,&fState);
2066 if (CurrentLine.nonempty()) {
2067 if (CurrentLine.FirstNonSpace()=='(') { // could be a tree string
2068 ProcessTree (&fState,f, CurrentLine);
2069 }
2070 }
2071 break;
2072 }
2073
2074 } else {
// Sequential mode: overshooting the expected site count means the auto-detect
// guessed wrong -- reset ALL progress counters, rewind the stream to the saved
// position, drop everything parsed so far, and reparse as interleaved.
2075 if (fState.totalSitesRead > fState.totalSitesExpected)
2076 // oops - autodetect incorrectly assumed that the file was sequential
2077 {
2078 fState.curSpecies =
2079 fState.totalSpeciesRead =
2080 fState.totalSitesRead =
2081 fState.curSite =
2082 fState.totalSpeciesExpected =
2083 fState.totalSitesExpected =
2084 fState.maxStringLength = 0;
2085 fState.allSpeciesDefined = false;
2086 fState.interleaved = true;
2087 fState.autoDetect = true;
2088
2089 if(f) {
2090 f->seek (filePosition, SEEK_SET);
2091 } else {
2092 fState.pInSrc = filePosition;
2093 }
2094
2095 CurrentLine = savedLine;
2096 result->ForEach ([] (BaseRef site, unsigned long) -> void {
2097 ((_Site*)site)->TrimSpace();
2098 });
2099
// Discard partially-built state before the reparse.
2100 result->theNames.Clear();
2101 result->theMap.Clear();
2102 result->Clear();
2103 result->theFrequencies.Clear();
2104 if (result->dsh) {
2105 result->dsh->incompletePatterns->Clear(false);
2106 delete (result->dsh);
2107 result->dsh = nil;
2108 }
// `continue` re-enters the inner dispatch with the restored line.
2109 continue;
2110 }
2111 if (fState.totalSpeciesRead==0) {
2112 fState.totalSpeciesExpected = 1;
2113 if (!fState.curSite) {
2114 TrimPhylipLine (CurrentLine, (*result));
2115 }
2116 }
2117
// A full sequence was completed: either start the next species or, if all
// expected species are in, look for an optional trailing tree and stop.
2118 else if (fState.curSite>=fState.totalSitesExpected) {
2119 fState.totalSpeciesExpected++;
2120 if (fState.totalSpeciesExpected>saveSpecExpected) {
2121 // reached the end of the data - see maybe there is a tree
2122 ReadNextLine (f,&CurrentLine,&fState);
2123 if (CurrentLine.nonempty()) {
2124 if (CurrentLine.FirstNonSpace()=='(') { // could be a tree string
2125 ProcessTree (&fState,f, CurrentLine);
2126 }
2127 }
2128 break;
2129 }
2130 TrimPhylipLine (CurrentLine, (*result));
2131 }
2132 }
2133
// Feed the (name-stripped) sequence characters into the data set.
2134 ISelector (fState, CurrentLine, (*result));
2135 }
2136 break;
2137 }
2138 // that's all for PHYLIP
2139
2140 // now handle raw data case
2141 if (fState.fileType == 2) { // raw data
2142 FilterRawString(CurrentLine, &fState, (*result));
2143 if (CurrentLine.nonempty()) {
2144 break;
2145 }
2146 if (ProcessLine (CurrentLine, &fState, (*result))) {
2147 fState.curSpecies++;
2148 fState.totalSpeciesRead++;
2149 }
2150 break;
2151 }
2152
2153 // lastly, handle the auto-detect standard case
2154
2155 // check to see if the string defines a name
2156 if (c=='#' || c=='>') { // a name it is
2157 if (fState.allSpeciesDefined) { // can't define the species after data
2158 break;
2159 } else {
// Names seen before any data imply an interleaved layout; names mixed with
// data imply sequential.
2160 if ((!fState.totalSpeciesRead)&&(fState.totalSpeciesExpected>=1)) {
2161 fState.interleaved = TRUE;
2162 } else {
2163 fState.interleaved = FALSE;
2164 }
2165 fState.totalSpeciesExpected++;
2166 CurrentLine.Trim(CurrentLine.FirstNonSpaceIndex(1),kStringEnd);
// A bare '#'/'>' marker with no name text gets an auto-generated name.
2167 if (CurrentLine.char_at(0) == '#' || CurrentLine.char_at(0) == '>') {
2168 CurrentLine = kDefSeqNamePrefix &_String(fState.totalSpeciesExpected);
2169 }
2170 result->AddName (CurrentLine);
2171 }
2172 break;
2173 }
2174 // check to see if the string defines a tree
2175 if (c=='(') {
2176 ProcessTree (&fState,f, CurrentLine);
2177 ReadNextLine (f,&CurrentLine,&fState);
2178 }
2179
2180 // check to see where to stick the incoming line
2181
2182 if (fState.totalSpeciesExpected == 0) {
2183 // raw data fed before names defined - skip
2184 break;
2185 }
2186 if( fState.totalSpeciesExpected>1 && fState.totalSpeciesRead == 0) {
2187 fState.allSpeciesDefined = TRUE;
2188 }
2189
2190 // repeat the structure of PHYLIP reader
2191
2192 ISelector (fState, CurrentLine, (*result));
2193
2194 break;
2195 }
2196
// Bottom of the outer loop: fetch the next input line.
2197 ReadNextLine (f,&CurrentLine,&fState);
2198
2199 }
2200 }
2201 }
2202
2203
2204
// Post-parse fix-ups: compact any sites left incomplete by an interleaved read
// (unless the data set uses internal/horizontal storage).
2205 if (fState.totalSitesRead && fState.interleaved && !result->InternalStorageMode()) {
2206 for (long i = fState.curSite; i<fState.totalSitesRead; i++) {
2207 result->Compact(i);
2208 }
2209 result->ResetIHelper();
2210 }
2211
2212 if ((!fState.interleaved)&&(fState.fileType!=2)) {
2213 PadLine (fState, (*result));
2214 }
2215
2216
2217
2218 // make sure interleaved duplications are handled correctly
2219
2220 result->Finalize();
2221 result->noOfSpecies = fState.totalSpeciesRead;
2222 result->theTT = fState.translationTable;
2223
2224 // check to see if result may be an amino-acid data
2225 if (doAlphaConsistencyCheck && result->theTT == &hy_default_translation_table) {
// No recognized characters at all: retry the whole read with a binary
// (baseLength = 2) trial table via a recursive call; keep whichever result
// actually produced data.
2226 if (result->GetNoTypes() == 0)
2227 // emptyString data set
2228 // try binary data
2229 {
2230 _TranslationTable *trialTable = new _TranslationTable (hy_default_translation_table);
2231 trialTable->baseLength = 2;
// unlock before the recursive call re-opens/rewinds the same file handle
2232 if (f) f->unlock();
2233 _DataSet * res2 = ReadDataSetFile (f, execBF, theS, bfName, namespaceID, trialTable);
2234 if (res2->GetNoTypes()) {
2235 DeleteObject (result);
2236 return res2;
2237 }
2238 DeleteObject (res2);
2239 } else
2240 // check it out
2241 if (result->CheckAlphabetConsistency()<0.5)
2242 // less than 50% of the data in the alphabet is not in the basic alphabet
2243 {
// Poor nucleotide fit: trial an amino-acid table (baseLength = 20). Note the
// stack-allocated trialTable is only temporarily pointed to by theTT; on
// success a dynamic copy replaces it before the scope ends.
2244 _TranslationTable trialTable (hy_default_translation_table);
2245 trialTable.baseLength = 20;
2246 (*result).theTT = &trialTable;
2247 if ((*result).CheckAlphabetConsistency()<0.5) {
2248 CurrentLine = "More than 50% of characters in the data are not in the alphabet.";
2249 (*result).theTT = &hy_default_translation_table;
2250 ReportWarning (CurrentLine);
2251 } else {
2252 (*result).theTT = (_TranslationTable*)trialTable.makeDynamic();
2253 }
2254
2255 }
2256
2257 }
// If the NEXUS reader captured an embedded batch-language block, execute it now
// (execBF == 1), or just discard it (execBF == 0); any other execBF value
// presumably preserves it for the caller -- confirm against the signature above.
2258 if (nexusBFBody.nonempty()) {
2259 if (execBF == 1) {
2260 lastNexusDataMatrix = result;
2261
// Record the function count so ExecuteAndClean can remove functions the
// batch block defines.
2262 long bfl = GetBFFunctionCount ();
2263
// Reuse the caller-supplied execution list if given, else build a fresh one.
2264 _ExecutionList * nexusBF = ex ? ex : new _ExecutionList;
2265 if (namespaceID) {
2266 nexusBF->SetNameSpace(*namespaceID);
2267 }
2268
2269
2270 bool do_profile = hy_env::EnvVariableTrue("_PROFILE_NEXUS_LOADS_");
2271
2272
2273 nexusBF->BuildList(nexusBFBody,nil, false, true);
2274 //_ExecutionList nexusBF (nexusBFBody,namespaceID);
2275
2276
2277 if (bfName) {
2278 nexusBF->sourceFile = *bfName;
2279 }
2280
2281 if (do_profile) {
2282 nexusBF->StartProfile();
2283 }
2284 nexusBF->ExecuteAndClean(bfl);
2285 if (do_profile) {
2286 CheckReceptacleAndStore("_NEXUS_PROFILE_DATA_", kEmptyString, false , nexusBF->CollectProfile(), false);
2287 }
2288
// Only delete the list if we allocated it here; a caller-owned `ex` is
// cleared but left alive.
2289 if (nexusBF != ex) {
2290 DeleteObject (nexusBF);
2291 } else {
2292 ex->ClearExecutionList();
2293 ex->Clear();
2294 }
2295 nexusBFBody = kEmptyString;
2296 } else if (execBF == 0) {
2297 nexusBFBody = kEmptyString;
2298 }
2299 }
// Error path: release the partially-built data set, unlock the file, report,
// and return nil to the caller.
2300 } catch (const _String& err) {
2301 DeleteObject (result);
2302 if (f) f->unlock();
2303 HandleApplicationError(err);
2304 result = nil;
2305 }
2306 if (f) f->unlock();
2307 return result;
2308 }
2309
2310
2311