1# Copyright 2005-2008 by Frank Kauff & Cymon J. Cox. All rights reserved.
2# Revisions copyright 2014-2015 by Joe Cora (standard data)
3#
4# This file is part of the Biopython distribution and governed by your
5# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
6# Please see the LICENSE file that should have been included as part of this
7# package.
8
9"""Nexus class. Parse the contents of a NEXUS file.
10
11Based upon 'NEXUS: An extensible file format for systematic information'
12Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621
13"""
14
15from functools import reduce
16import copy
17import math
18import random
19import sys
20
21from Bio import File
22from Bio.Data import IUPACData
23from Bio.Seq import Seq
24
25from Bio.Nexus.StandardData import StandardData
26from Bio.Nexus.Trees import Tree
27
28
29INTERLEAVE = 70
30SPECIAL_COMMANDS = [
31    "charstatelabels",
32    "charlabels",
33    "taxlabels",
34    "taxset",
35    "charset",
36    "charpartition",
37    "taxpartition",
38    "matrix",
39    "tree",
40    "utree",
41    "translate",
42    "codonposset",
43    "title",
44]
45KNOWN_NEXUS_BLOCKS = ["trees", "data", "characters", "taxa", "sets", "codons"]
46PUNCTUATION = "()[]{}\\,;:=*\\'\"`+-<>"
47MRBAYESSAFE = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_"
48WHITESPACE = " \t\n"
49# SPECIALCOMMENTS = ['!','&','%','/','\\','@'] # original list of special comments
50SPECIALCOMMENTS = [
51    "&"
52]  # supported special comment ('tree' command), all others are ignored
53CHARSET = "chars"
54TAXSET = "taxa"
55CODONPOSITIONS = "codonpositions"
56DEFAULTNEXUS = (
57    "#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=dna; end; "
58)
59
60
61class NexusError(Exception):
62    """Provision for the management of Nexus exceptions."""
63
64    pass
65
66
67class CharBuffer:
68    """Helps reading NEXUS-words and characters from a buffer (semi-PRIVATE).
69
70    This class is not intended for public use (any more).
71    """
72
73    def __init__(self, string):
74        """Initialize the class."""
75        if string:
76            self.buffer = list(string)
77        else:
78            self.buffer = []
79
80    def peek(self):
81        """Return the first character from the buffer."""
82        if self.buffer:
83            return self.buffer[0]
84        else:
85            return None
86
87    def peek_nonwhitespace(self):
88        """Return the first character from the buffer, do not include spaces."""
89        b = "".join(self.buffer).strip()
90        if b:
91            return b[0]
92        else:
93            return None
94
95    def __next__(self):
96        """Iterate over NEXUS characters in the file."""
97        if self.buffer:
98            return self.buffer.pop(0)
99        else:
100            return None
101
102    def next_nonwhitespace(self):
103        """Check for next non whitespace character in NEXUS file."""
104        while True:
105            p = next(self)
106            if p is None:
107                break
108            if p not in WHITESPACE:
109                return p
110        return None
111
112    def skip_whitespace(self):
113        """Skip whitespace characters in NEXUS file."""
114        while self.buffer[0] in WHITESPACE:
115            self.buffer = self.buffer[1:]
116
117    def next_until(self, target):
118        """Iterate over the NEXUS file until a target character is reached."""
119        for t in target:
120            try:
121                pos = self.buffer.index(t)
122            except ValueError:
123                pass
124            else:
125                found = "".join(self.buffer[:pos])
126                self.buffer = self.buffer[pos:]
127                return found
128        else:
129            return None
130
131    def peek_word(self, word):
132        """Return a word stored in the buffer."""
133        return "".join(self.buffer[: len(word)]) == word
134
135    def next_word(self):
136        """Return the next NEXUS word from a string.
137
138        This deals with single and double quotes, whitespace and punctuation.
139        """
140        word = []
141        quoted = False
142        # get first character
143        first = self.next_nonwhitespace()
144        if not first:
145            # return empty if only whitespace left
146            return None
147        word.append(first)
148        if first == "'":
149            quoted = "'"
150        elif first == '"':
151            quoted = '"'
152        elif first in PUNCTUATION:
153            # if it's non-quote punctuation, return immediately
154            return first
155        while True:
156            c = self.peek()
157            if c == quoted:  # a quote?
158                word.append(next(self))  # store quote
159                if self.peek() == quoted:  # double quote
160                    next(self)  # skip second quote
161                elif quoted:  # second single quote ends word
162                    break
163            elif quoted:
164                # if quoted, then add anything
165                word.append(next(self))
166            elif not c or c in PUNCTUATION or c in WHITESPACE:
167                # if not quoted and special character, stop
168                break
169            else:
170                word.append(next(self))  # standard character
171        return "".join(word)
172
173    def rest(self):
174        """Return the rest of the string without parsing."""
175        return "".join(self.buffer)
176
177
178class StepMatrix:
179    """Calculate a stepmatrix for weighted parsimony.
180
181    See :
182    COMBINATORIAL WEIGHTS IN PHYLOGENETIC ANALYSIS - A STATISTICAL PARSIMONY PROCEDURE
183    Wheeler (1990), Cladistics 6:269-275.
184    """
185
186    def __init__(self, symbols, gap):
187        """Initialize the class."""
188        self.data = {}
189        self.symbols = sorted(symbols)
190        if gap:
191            self.symbols.append(gap)
192        for x in self.symbols:
193            for y in [s for s in self.symbols if s != x]:
194                self.set(x, y, 0)
195
196    def set(self, x, y, value):
197        """Set a given value in the matrix's position."""
198        if x > y:
199            x, y = y, x
200        self.data[x + y] = value
201
202    def add(self, x, y, value):
203        """Add the given value to existing, in matrix's position."""
204        if x > y:
205            x, y = y, x
206        self.data[x + y] += value
207
208    def sum(self):
209        """Calculate the associations, makes matrix of associations."""
210        return reduce(lambda x, y: x + y, self.data.values())
211
212    def transformation(self):
213        """Calculate the transformation matrix.
214
215        Normalizes the columns of the matrix of associations.
216        """
217        total = self.sum()
218        if total != 0:
219            for k in self.data:
220                self.data[k] = self.data[k] / float(total)
221        return self
222
223    def weighting(self):
224        """Calculate the Phylogenetic weight matrix.
225
226        Constructed from the logarithmic transformation of the
227        transformation matrix.
228        """
229        for k in self.data:
230            if self.data[k] != 0:
231                self.data[k] = -math.log(self.data[k])
232        return self
233
234    def smprint(self, name="your_name_here"):
235        """Print a stepmatrix."""
236        matrix = "usertype %s stepmatrix=%d\n" % (name, len(self.symbols))
237        matrix += "        %s\n" % "        ".join(self.symbols)
238        for x in self.symbols:
239            matrix += "[%s]".ljust(8) % x
240            for y in self.symbols:
241                if x == y:
242                    matrix += " .       "
243                else:
244                    if x > y:
245                        x1, y1 = y, x
246                    else:
247                        x1, y1 = x, y
248                    if self.data[x1 + y1] == 0:
249                        matrix += "inf.     "
250                    else:
251                        matrix += "%2.2f".ljust(10) % (self.data[x1 + y1])
252            matrix += "\n"
253        matrix += ";\n"
254        return matrix
255
256
257def safename(name, mrbayes=False):
258    """Return a taxon identifier according to NEXUS standard.
259
260    Wrap quotes around names with punctuation or whitespace, and double
261    single quotes.
262
263    mrbayes=True: write names without quotes, whitespace or punctuation
264    for the mrbayes software package.
265    """
266    if mrbayes:
267        safe = name.replace(" ", "_")
268        safe = "".join(c for c in safe if c in MRBAYESSAFE)
269    else:
270        safe = name.replace("'", "''")
271        if set(safe).intersection(set(WHITESPACE + PUNCTUATION)):
272            safe = "'" + safe + "'"
273    return safe
274
275
276def quotestrip(word):
277    """Remove quotes and/or double quotes around identifiers."""
278    if not word:
279        return None
280    while (word.startswith("'") and word.endswith("'")) or (
281        word.startswith('"') and word.endswith('"')
282    ):
283        word = word[1:-1]
284    return word
285
286
287def get_start_end(sequence, skiplist=("-", "?")):
288    """Return position of first and last character which is not in skiplist.
289
290    Skiplist defaults to ['-','?'].
291    """
292    length = len(sequence)
293    if length == 0:
294        return None, None
295    end = length - 1
296    while end >= 0 and (sequence[end] in skiplist):
297        end -= 1
298    start = 0
299    while start < length and (sequence[start] in skiplist):
300        start += 1
301    if start == length and end == -1:  # empty sequence
302        return -1, -1
303    else:
304        return start, end
305
306
307def _sort_keys_by_values(p):
308    """Return a sorted list of keys of p sorted by values of p (PRIVATE)."""
309    return sorted((pn for pn in p if p[pn]), key=lambda pn: p[pn])
310
311
312def _make_unique(l):
313    """Check all values in list are unique and return a pruned and sorted list (PRIVATE)."""
314    return sorted(set(l))
315
316
317def _unique_label(previous_labels, label):
318    """Return a unique name if label is already in previous_labels (PRIVATE)."""
319    while label in previous_labels:
320        label_split = label.split(".")
321        if label_split[-1].startswith("copy"):
322            copy_num = 1
323            if label_split[-1] != "copy":
324                copy_num = int(label_split[-1][4:]) + 1
325            new_label = "%s.copy%s" % (".".join(label_split[:-1]), copy_num)
326            label = new_label
327        else:
328            label += ".copy"
329    return label
330
331
332def _seqmatrix2strmatrix(matrix):
333    """Convert a Seq-object matrix to a plain sequence-string matrix (PRIVATE)."""
334    return {t: str(matrix[t]) for t in matrix}
335
336
337def _compact4nexus(orig_list):
338    r"""Compact lists for Nexus output (PRIVATE).
339
340    Example
341    -------
342    >>> _compact4nexus([1, 2, 3, 5, 6, 7, 8, 12, 15, 18, 20])
343    '2-4 6-9 13-19\\3 21'
344
345    Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class)
346    into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.).
347
348    """
349    if not orig_list:
350        return ""
351    orig_list = sorted(set(orig_list))
352    shortlist = []
353    clist = orig_list[:]
354    clist.append(clist[-1] + 0.5)  # dummy value makes it easier
355    while len(clist) > 1:
356        step = 1
357        for i, x in enumerate(clist):
358            if x == clist[0] + i * step:  # are we still in the right step?
359                continue
360            elif i == 1 and len(clist) > 3 and clist[i + 1] - x == x - clist[0]:
361                # second element, and possibly at least 3 elements to link,
362                # and the next one is in the right step
363                step = x - clist[0]
364            else:  # pattern broke, add all values before current position to new list
365                sub = clist[:i]
366                if len(sub) == 1:
367                    shortlist.append(str(sub[0] + 1))
368                else:
369                    if step == 1:
370                        shortlist.append("%d-%d" % (sub[0] + 1, sub[-1] + 1))
371                    else:
372                        shortlist.append("%d-%d\\%d" % (sub[0] + 1, sub[-1] + 1, step))
373                clist = clist[i:]
374                break
375    return " ".join(shortlist)
376
377
378def combine(matrices):
379    """Combine matrices in [(name,nexus-instance),...] and return new nexus instance.
380
381    combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...]
382    Character sets, character partitions and taxon sets are prefixed, readjusted
383    and present in the combined matrix.
384    """
385    if not matrices:
386        return None
387    name = matrices[0][0]
388    combined = copy.deepcopy(matrices[0][1])  # initiate with copy of first matrix
389    mixed_datatypes = len({n[1].datatype for n in matrices}) > 1
390    if mixed_datatypes:
391        # dealing with mixed matrices is application specific.
392        # You take care of that yourself!
393        combined.datatype = "None"
394    #    raise NexusError('Matrices must be of same datatype')
395    combined.charlabels = None
396    combined.statelabels = None
397    combined.interleave = False
398    combined.translate = None
399
400    # rename taxon sets and character sets and name them with prefix
401    for cn, cs in combined.charsets.items():
402        combined.charsets["%s.%s" % (name, cn)] = cs
403        del combined.charsets[cn]
404    for tn, ts in combined.taxsets.items():
405        combined.taxsets["%s.%s" % (name, tn)] = ts
406        del combined.taxsets[tn]
407    # previous partitions usually don't make much sense in combined matrix
408    # just initiate one new partition parted by single matrices
409    combined.charpartitions = {"combined": {name: list(range(combined.nchar))}}
410    for n, m in matrices[1:]:  # add all other matrices
411        both = [t for t in combined.taxlabels if t in m.taxlabels]
412        combined_only = [t for t in combined.taxlabels if t not in both]
413        m_only = [t for t in m.taxlabels if t not in both]
414        for t in both:
415            # concatenate sequences and unify gap and missing character symbols
416            combined.matrix[t] += Seq(
417                str(m.matrix[t])
418                .replace(m.gap, combined.gap)
419                .replace(m.missing, combined.missing),
420            )
421        # replace date of missing taxa with symbol for missing data
422        for t in combined_only:
423            combined.matrix[t] += Seq(combined.missing * m.nchar)
424        for t in m_only:
425            combined.matrix[t] = Seq(combined.missing * combined.nchar) + Seq(
426                str(m.matrix[t])
427                .replace(m.gap, combined.gap)
428                .replace(m.missing, combined.missing),
429            )
430        combined.taxlabels.extend(m_only)  # new taxon list
431        for cn, cs in m.charsets.items():  # adjust character sets for new matrix
432            combined.charsets["%s.%s" % (n, cn)] = [x + combined.nchar for x in cs]
433        if m.taxsets:
434            if not combined.taxsets:
435                combined.taxsets = {}
436            # update taxon sets
437            combined.taxsets.update(
438                {"%s.%s" % (n, tn): ts for tn, ts in m.taxsets.items()}
439            )
440        # update new charpartition
441        combined.charpartitions["combined"][n] = list(
442            range(combined.nchar, combined.nchar + m.nchar)
443        )
444        # update charlabels
445        if m.charlabels:
446            if not combined.charlabels:
447                combined.charlabels = {}
448            combined.charlabels.update(
449                {combined.nchar + i: label for i, label in m.charlabels.items()}
450            )
451        combined.nchar += m.nchar  # update nchar and ntax
452        combined.ntax += len(m_only)
453
454    # some prefer partitions, some charsets:
455    # make separate charset for ecah initial dataset
456    for c in combined.charpartitions["combined"]:
457        combined.charsets[c] = combined.charpartitions["combined"][c]
458
459    return combined
460
461
462def _kill_comments_and_break_lines(text):
463    r"""Delete []-delimited comments out of a file and break into lines separated by ';' (PRIVATE).
464
465    stripped_text=_kill_comments_and_break_lines(text):
466    Nested and multiline comments are allowed. [ and ] symbols within single
467    or double quotes are ignored, newline ends a quote, all symbols with quotes are
468    treated the same (thus not quoting inside comments like [this character ']' ends a comment])
469    Special [&...] and [\...] comments remain untouched, if not inside standard comment.
470    Quotes inside special [& and [\ are treated as normal characters,
471    but no nesting inside these special comments allowed (like [&   [\   ]]).
472    ';' ist deleted from end of line.
473
474    NOTE: this function is very slow for large files, and obsolete when using C extension cnexus
475    """
476    if not text:
477        return ""
478    contents = iter(text)
479    newtext = []
480    newline = []
481    quotelevel = ""
482    speciallevel = False
483    commlevel = 0
484    # Parse with one character look ahead (for special comments)
485    t2 = next(contents)
486    while True:
487        t = t2
488        try:
489            t2 = next(contents)
490        except StopIteration:
491            t2 = None
492        if t is None:
493            break
494        if t == quotelevel and not (commlevel or speciallevel):
495            # matching quote ends quotation
496            quotelevel = ""
497        elif (
498            not quotelevel
499            and not (commlevel or speciallevel)
500            and (t == '"' or t == "'")
501        ):
502            # single or double quote starts quotation
503            quotelevel = t
504        elif not quotelevel and t == "[":
505            # opening bracket outside a quote
506            if t2 in SPECIALCOMMENTS and commlevel == 0 and not speciallevel:
507                speciallevel = True
508            else:
509                commlevel += 1
510        elif not quotelevel and t == "]":
511            # closing bracket ioutside a quote
512            if speciallevel:
513                speciallevel = False
514            else:
515                commlevel -= 1
516                if commlevel < 0:
517                    raise NexusError("Nexus formatting error: unmatched ]")
518                continue
519        if commlevel == 0:
520            # copy if we're not in comment
521            if t == ";" and not quotelevel:
522                newtext.append("".join(newline))
523                newline = []
524            else:
525                newline.append(t)
526    # level of comments should be 0 at the end of the file
527    if newline:
528        newtext.append("\n".join(newline))
529    if commlevel > 0:
530        raise NexusError("Nexus formatting error: unmatched [")
531    return newtext
532
533
534def _adjust_lines(lines):
535    """Adjust linebreaks to match ';', strip leading/trailing whitespace (PRIVATE).
536
537    list_of_commandlines=_adjust_lines(input_text)
538    Lines are adjusted so that no linebreaks occur within a commandline
539    (except matrix command line)
540    """
541    formatted_lines = []
542    for line in lines:
543        # Convert line endings
544        line = line.replace("\r\n", "\n").replace("\r", "\n").strip()
545        if line.lower().startswith("matrix"):
546            formatted_lines.append(line)
547        else:
548            line = line.replace("\n", " ")
549            if line:
550                formatted_lines.append(line)
551    return formatted_lines
552
553
554def _replace_parenthesized_ambigs(seq, rev_ambig_values):
555    """Replace ambigs in xxx(ACG)xxx format by IUPAC ambiguity code (PRIVATE)."""
556    opening = seq.find("(")
557    while opening > -1:
558        closing = seq.find(")")
559        if closing < 0:
560            raise NexusError("Missing closing parenthesis in: " + seq)
561        elif closing < opening:
562            raise NexusError("Missing opening parenthesis in: " + seq)
563        ambig = "".join(sorted(seq[opening + 1 : closing]))
564        ambig_code = rev_ambig_values[ambig.upper()]
565        if ambig != ambig.upper():
566            ambig_code = ambig_code.lower()
567        seq = seq[:opening] + ambig_code + seq[closing + 1 :]
568        opening = seq.find("(")
569    return seq
570
571
572class Commandline:
573    """Represent a commandline as command and options."""
574
575    def __init__(self, line, title):
576        """Initialize the class."""
577        self.options = {}
578        options = []
579        self.command = None
580        try:
581            # Assume matrix (all other command lines have been stripped of \n)
582            self.command, options = line.strip().split("\n", 1)
583        except ValueError:  # Not matrix
584            # self.command,options=line.split(' ',1)  # no: could be tab or spaces (translate...)
585            self.command = line.split()[0]
586            options = " ".join(line.split()[1:])
587        self.command = self.command.strip().lower()
588        if self.command in SPECIAL_COMMANDS:
589            # special command that need newlines and order of options preserved
590            self.options = options.strip()
591        else:
592            if len(options) > 0:
593                try:
594                    options = options.replace("=", " = ").split()
595                    valued_indices = [
596                        (n - 1, n, n + 1)
597                        for n in range(len(options))
598                        if options[n] == "=" and n != 0 and n != len(options)
599                    ]
600                    indices = []
601                    for sl in valued_indices:
602                        indices.extend(sl)
603                    token_indices = [n for n in range(len(options)) if n not in indices]
604                    for opt in valued_indices:
605                        # self.options[options[opt[0]].lower()] = options[opt[2]].lower()
606                        self.options[options[opt[0]].lower()] = options[opt[2]]
607                    for token in token_indices:
608                        self.options[options[token].lower()] = None
609                except ValueError:
610                    raise NexusError(
611                        "Incorrect formatting in line: %s" % line
612                    ) from None
613
614
615class Block:
616    """Represent a NEXUS block with block name and list of commandlines."""
617
618    def __init__(self, title=None):
619        """Initialize the class."""
620        self.title = title
621        self.commandlines = []
622
623
624class Nexus:
625    """Create the Nexus class, main class for the management of Nexus files."""
626
627    def __init__(self, input=None):
628        """Initialize the class."""
629        self.ntax = 0  # number of taxa
630        self.nchar = 0  # number of characters
631        self.unaltered_taxlabels = (
632            []
633        )  # taxlabels as the appear in the input file (incl. duplicates, etc.)
634        self.taxlabels = []  # labels for taxa, ordered by their id
635        self.charlabels = None  # ... and for characters
636        self.statelabels = None  # ... and for states
637        self.datatype = "dna"  # (standard), dna, rna, nucleotide, protein
638        self.respectcase = False  # case sensitivity
639        self.missing = "?"  # symbol for missing characters
640        self.gap = "-"  # symbol for gap
641        self.symbols = None  # set of symbols
642        self.equate = None  # set of symbol synonyms
643        self.matchchar = None  # matching char for matrix representation
644        self.labels = None  # left, right, no
645        self.transpose = False  # whether matrix is transposed
646        self.interleave = False  # whether matrix is interleaved
647        self.tokens = False  # unsupported
648        self.eliminate = None  # unsupported
649        self.matrix = None  # ...
650        self.unknown_blocks = []  # blocks we don't care about
651        self.taxsets = {}
652        self.charsets = {}
653        self.charpartitions = {}
654        self.taxpartitions = {}
655        self.trees = []  # list of Trees (instances of Tree class)
656        self.translate = None  # Dict to translate taxon <-> taxon numbers
657        self.structured = []  # structured input representation
658        self.set = {}  # dict of the set command to set various options
659        self.options = {}  # dict of the options command in the data block
660        self.codonposset = (
661            None  # name of the charpartition that defines codon positions
662        )
663
664        # some defaults
665        self.options["gapmode"] = "missing"
666
667        if input:
668            self.read(input)
669        else:
670            self.read(DEFAULTNEXUS)
671
672    def get_original_taxon_order(self):
673        """Included for backwards compatibility (DEPRECATED)."""
674        return self.taxlabels
675
676    def set_original_taxon_order(self, value):
677        """Included for backwards compatibility (DEPRECATED)."""
678        self.taxlabels = value
679
680    original_taxon_order = property(get_original_taxon_order, set_original_taxon_order)
681
682    def read(self, input):
683        """Read and parse NEXUS input (a filename, file-handle, or string)."""
684        # 1. Assume we have the name of a file in the execution dir or a
685        # file-like object.
686        # Note we need to add parsing of the path to dir/filename
687        try:
688            with File.as_handle(input) as fp:
689                file_contents = fp.read()
690                self.filename = getattr(fp, "name", "Unknown_nexus_file")
691        except (TypeError, OSError, AttributeError):
692            # 2. Assume we have a string from a fh.read()
693            if isinstance(input, str):
694                file_contents = input
695                self.filename = "input_string"
696            else:
697                print(input.strip()[:50])
698                raise NexusError("Unrecognized input: %s ..." % input[:100]) from None
699        file_contents = file_contents.strip()
700        if file_contents.startswith("#NEXUS"):
701            file_contents = file_contents[6:]
702        commandlines = _get_command_lines(file_contents)
703        # get rid of stupid 'NEXUS token - in merged treefiles, this might appear multiple times'
704        for i, cl in enumerate(commandlines):
705            try:
706                if cl[:6].upper() == "#NEXUS":
707                    commandlines[i] = cl[6:].strip()
708            except IndexError:
709                pass
710        # now loop through blocks (we parse only data in known blocks, thus ignoring non-block commands
711        nexus_block_gen = self._get_nexus_block(commandlines)
712        while True:
713            try:
714                title, contents = next(nexus_block_gen)
715            except StopIteration:
716                break
717            if title in KNOWN_NEXUS_BLOCKS:
718                self._parse_nexus_block(title, contents)
719            else:
720                self._unknown_nexus_block(title, contents)
721
722    def _get_nexus_block(self, file_contents):
723        """Return a generator for looping through Nexus blocks (PRIVATE)."""
724        inblock = False
725        blocklines = []
726        while file_contents:
727            cl = file_contents.pop(0)
728            if cl.lower().startswith("begin"):
729                if not inblock:
730                    inblock = True
731                    title = cl.split()[1].lower()
732                else:
733                    raise NexusError("Illegal block nesting in block %s" % title)
734            elif cl.lower().startswith("end"):
735                if inblock:
736                    inblock = False
737                    yield title, blocklines
738                    blocklines = []
739                else:
740                    raise NexusError("Unmatched 'end'.")
741            elif inblock:
742                blocklines.append(cl)
743
744    def _unknown_nexus_block(self, title, contents):
745        block = Block()
746        block.commandlines.append(contents)
747        block.title = title
748        self.unknown_blocks.append(block)
749
750    def _parse_nexus_block(self, title, contents):
751        """Parse a known Nexus Block (PRIVATE)."""
752        # attached the structured block representation
753        self._apply_block_structure(title, contents)
754        # now check for taxa,characters,data blocks. If this stuff is defined more than once
755        # the later occurrences will override the previous ones.
756        block = self.structured[-1]
757        for line in block.commandlines:
758            try:
759                getattr(self, "_" + line.command)(line.options)
760            except AttributeError:
761                raise NexusError("Unknown command: %s " % line.command) from None
762
763    def _title(self, options):
764        pass
765
766    def _link(self, options):
767        pass
768
769    def _dimensions(self, options):
770        if "ntax" in options:
771            self.ntax = eval(options["ntax"])
772        if "nchar" in options:
773            self.nchar = eval(options["nchar"])
774
775    def _format(self, options):
776        # print options
777        # we first need to test respectcase, then symbols (which depends on respectcase)
778        # then datatype (which, if standard, depends on symbols and respectcase in order to generate
779        # dicts for ambiguous values
780        if "respectcase" in options:
781            self.respectcase = True
782        # adjust symbols to for respectcase
783        if "symbols" in options:
784            self.symbols = "".join(options["symbols"].split())
785            if (self.symbols.startswith('"') and self.symbols.endswith('"')) or (
786                self.symbols.startswith("'") and self.symbols.endswith("'")
787            ):
788                self.symbols = self.symbols[1:-1]
789            if not self.respectcase:
790                self.symbols = list(self.symbols.upper())
791                # self.symbols = self.symbols.lower() + self.symbols.upper()
792                # self.symbols = list(set(self.symbols))
793        if "datatype" in options:
794            self.datatype = options["datatype"].lower()
795
796            if self.datatype == "dna" or self.datatype == "nucleotide":
797                self.ambiguous_values = IUPACData.ambiguous_dna_values.copy()
798                self.unambiguous_letters = IUPACData.unambiguous_dna_letters
799            elif self.datatype == "rna":
800                self.ambiguous_values = IUPACData.ambiguous_rna_values.copy()
801                self.unambiguous_letters = IUPACData.unambiguous_rna_letters
802            elif self.datatype == "protein":
803                self.ambiguous_values = {
804                    "B": "DN",
805                    "Z": "EQ",
806                    "X": IUPACData.protein_letters,
807                }
808                # that's how PAUP handles it
809                self.unambiguous_letters = IUPACData.protein_letters + "*"  # stop-codon
810            elif self.datatype == "standard":
811                self.ambiguous_values = {}
812                if not self.symbols:
813                    # PARSER BUG ##
814                    # This error arises when symbols are absent or when
815                    # whitespace is located within the SYMBOLS command values.
816                    # The Nexus parser quits reading the SYMBOLS line upon
817                    # finding a whitespace character.
818                    raise NexusError(
819                        "Symbols must be defined when using standard datatype. "
820                        "Please remove any whitespace (spaces, tabs, etc.) "
821                        "between values for symbols as this confuses the Nexus parser."
822                    )
823
824                self.unambiguous_letters = "".join(self.symbols)
825                if not self.respectcase:
826                    self.unambiguous_letters += self.unambiguous_letters.lower()
827            else:
828                raise NexusError("Unsupported datatype: " + self.datatype)
829            self.valid_characters = (
830                "".join(self.ambiguous_values) + self.unambiguous_letters
831            )
832            if not self.respectcase:
833                self.valid_characters = (
834                    self.valid_characters.lower() + self.valid_characters.upper()
835                )
836            # we have to sort the reverse ambig coding dict key characters:
837            # to be sure that it's 'ACGT':'N' and not 'GTCA':'N'
838            rev = {v: k for k, v in self.ambiguous_values.items() if k != "X"}
839            self.rev_ambiguous_values = {}
840            for k, v in rev.items():
841                key = sorted(c for c in k)
842                self.rev_ambiguous_values["".join(key)] = v
843        # overwrite symbols for datatype rna,dna,nucleotide
844        if self.datatype in ["dna", "nucleotide"]:
845            self.symbols = IUPACData.ambiguous_dna_letters
846            if self.missing not in self.ambiguous_values:
847                self.ambiguous_values[self.missing] = (
848                    self.unambiguous_letters + self.gap
849                )
850            self.ambiguous_values[self.gap] = self.gap
851        elif self.datatype == "rna":
852            self.symbols = IUPACData.ambiguous_rna_letters
853            if self.missing not in self.ambiguous_values:
854                self.ambiguous_values[self.missing] = (
855                    self.unambiguous_letters + self.gap
856                )
857            self.ambiguous_values[self.gap] = self.gap
858        # elif self.datatype == 'standard':
859        #    if not self.symbols:
860        #        self.symbols = ['0', '1']
861        if "missing" in options:
862            self.missing = options["missing"][0]
863        if "gap" in options:
864            self.gap = options["gap"][0]
865        if "equate" in options:
866            self.equate = options["equate"]
867        if "matchchar" in options:
868            self.matchchar = options["matchchar"][0]
869        if "labels" in options:
870            self.labels = options["labels"]
871        if "transpose" in options:
872            # self.transpose = True
873            raise NexusError("TRANSPOSE is not supported!")
874        if "interleave" in options:
875            if options["interleave"] is None or options["interleave"].lower() == "yes":
876                self.interleave = True
877        if "tokens" in options:
878            self.tokens = True
879        if "notokens" in options:
880            self.tokens = False
881
882    def _set(self, options):
883        self.set = options
884
885    def _options(self, options):
886        self.options = options
887
888    def _eliminate(self, options):
889        self.eliminate = options
890
891    def _taxlabels(self, options):
892        """Get taxon labels (PRIVATE).
893
894        As the taxon names are already in the matrix, this is superfluous
895        except for transpose matrices, which are currently unsupported anyway.
896        Thus, we ignore the taxlabels command to make handling of duplicate
897        taxon names easier.
898        """
899        pass
900        # self.taxlabels = []
901        # opts = CharBuffer(options)
902        # while True:
903        #    taxon = quotestrip(opts.next_word())
904        #    if not taxon:
905        #        break
906        #    self.taxlabels.append(taxon)
907
908    def _check_taxlabels(self, taxon):
909        """Check for presence of taxon in self.taxlabels (PRIVATE)."""
910        # According to NEXUS standard, underscores shall be treated as spaces...,
911        # so checking for identity is more difficult
912        nextaxa = {t.replace(" ", "_"): t for t in self.taxlabels}
913        nexid = taxon.replace(" ", "_")
914        return nextaxa.get(nexid)
915
916    def _charlabels(self, options):
917        """Get labels for characters (PRIVATE)."""
918        self.charlabels = {}
919        opts = CharBuffer(options)
920        while True:
921            # get id and state
922            w = opts.next_word()
923            if (
924                w is None
925            ):  # McClade saves and reads charlabel-lists with terminal comma?!
926                break
927            identifier = self._resolve(w, set_type=CHARSET)
928            state = quotestrip(opts.next_word())
929            self.charlabels[identifier] = state
930            # check for comma or end of command
931            c = opts.next_nonwhitespace()
932            if c is None:
933                break
934            elif c != ",":
935                raise NexusError("Missing ',' in line %s." % options)
936
937    def _charstatelabels(self, options):
938        self.charlabels = {}
939        self.statelabels = {}
940        opts = CharBuffer(options)
941
942        # Make sure symbols are defined
943        if not self.symbols:
944            raise NexusError("Symbols must be defined when using character states")
945
946        while True:
947            # get id and character name
948            w = opts.next_word()
949
950            # McClade saves and reads charlabel-lists with terminal comma?!
951            if w is None:
952                break
953
954            identifier = self._resolve(w, set_type=CHARSET)
955            character = quotestrip(opts.next_word())
956
957            self.charlabels[identifier] = character
958            self.statelabels[identifier] = []
959
960            # check for comma, slash or end of command
961            c = opts.next_nonwhitespace()
962
963            if c is None:
964                break
965            elif c != ",":
966                # Check if states are defined, otherwise report error
967                if c != "/":
968                    raise NexusError("Missing ',' in line %s." % options)
969
970                # Get the first state
971                state = quotestrip(opts.next_word())
972
973                if state is None:
974                    raise NexusError("Missing character state in line %s." % options)
975
976                while True:
977                    # Make sure current state does not exceed number of
978                    # available symbols
979                    if len(self.statelabels[identifier]) > len(self.symbols):
980                        raise NexusError(
981                            "Character states exceed number of available symbols in line %s."
982                            % options
983                        )
984
985                    # Add the character state to the statelabels
986                    self.statelabels[identifier].append(state)
987
988                    # Check for another state or comma to end states (last
989                    # character should not have comma at end of states - but
990                    # we'll ignore)
991                    state = quotestrip(opts.next_word())
992
993                    if state is None:
994                        return
995                    elif state == ",":
996                        break
997
998    def _statelabels(self, options):
999        # self.charlabels = options
1000        # print("Command statelabels is not supported and will be ignored.")
1001        pass
1002
1003    def _matrix(self, options):
1004        """Create a matrix for NEXUS object (PRIVATE)."""
1005        if not self.ntax or not self.nchar:
1006            raise NexusError("Dimensions must be specified before matrix!")
1007        self.matrix = {}
1008        taxcount = 0
1009        first_matrix_block = True
1010
1011        # eliminate empty lines and leading/trailing whitespace
1012        lines = [_.strip() for _ in options.split("\n") if _.strip() != ""]
1013        lineiter = iter(lines)
1014        while True:
1015            try:
1016                line = next(lineiter)
1017            except StopIteration:
1018                if taxcount < self.ntax:
1019                    raise NexusError("Not enough taxa in matrix.") from None
1020                elif taxcount > self.ntax:
1021                    raise NexusError("Too many taxa in matrix.") from None
1022                else:
1023                    break
1024            # count the taxa and check for interleaved matrix
1025            taxcount += 1
1026            if taxcount > self.ntax:
1027                if not self.interleave:
1028                    raise NexusError(
1029                        "Too many taxa in matrix - should matrix be interleaved?"
1030                    )
1031                else:
1032                    taxcount = 1
1033                    first_matrix_block = False
1034            # get taxon name and sequence
1035            linechars = CharBuffer(line)
1036            id = quotestrip(linechars.next_word())
1037            line = linechars.rest().strip()
1038            chars = ""
1039            if self.interleave:
1040                # interleaved matrix
1041                if line:
1042                    chars = "".join(line.split())
1043                else:
1044                    chars = "".join(next(lineiter).split())
1045            else:
1046                # non-interleaved matrix
1047                chars = "".join(line.split())
1048                while len(chars) < self.nchar:
1049                    line = next(lineiter)
1050                    chars += "".join(line.split())
1051
1052            # Reformat sequence for non-standard datatypes
1053            if self.datatype != "standard":
1054                iupac_seq = Seq(
1055                    _replace_parenthesized_ambigs(chars, self.rev_ambiguous_values),
1056                )
1057                # first taxon has the reference sequence if matchhar is used
1058                if taxcount == 1:
1059                    refseq = iupac_seq
1060                else:
1061                    if self.matchchar:
1062                        while True:
1063                            p = iupac_seq.find(self.matchchar)
1064                            if p == -1:
1065                                break
1066                            iupac_seq = Seq(
1067                                iupac_seq[:p] + refseq[p] + iupac_seq[p + 1 :]
1068                            )
1069
1070                # Check for invalid characters
1071                for i, c in enumerate(iupac_seq):
1072                    if (
1073                        c not in self.valid_characters
1074                        and c != self.gap
1075                        and c != self.missing
1076                    ):
1077                        raise NexusError(
1078                            "Taxon %s: Illegal character %s in sequence %s "
1079                            "(check dimensions/interleaving)" % (id, c, iupac_seq)
1080                        )
1081            else:
1082                iupac_seq = StandardData(chars)
1083
1084                # Check for invalid characters
1085                for i, c in enumerate(iupac_seq):
1086                    # Go through each coding for each character
1087                    for coding in c["d"]:
1088                        if coding not in self.valid_characters:
1089                            if coding != self.gap and coding != self.missing:
1090                                raise NexusError(
1091                                    "Taxon %s: Illegal character %s in sequence %s "
1092                                    "(check dimensions/interleaving)"
1093                                    % (id, coding, iupac_seq)
1094                                )
1095
1096            # add sequence to matrix
1097            if first_matrix_block:
1098                self.unaltered_taxlabels.append(id)
1099                id = _unique_label(list(self.matrix.keys()), id)
1100                self.matrix[id] = iupac_seq
1101                self.taxlabels.append(id)
1102            else:
1103                # taxon names need to be in the same order in each interleaved block
1104                id = _unique_label(self.taxlabels[: taxcount - 1], id)
1105                taxon_present = self._check_taxlabels(id)
1106                if taxon_present:
1107                    self.matrix[taxon_present] += iupac_seq
1108                else:
1109                    raise NexusError(
1110                        "Taxon %s not in first block of interleaved "
1111                        "matrix. Check matrix dimensions and interleave." % id
1112                    )
1113        # check all sequences for length according to nchar
1114        for taxon in self.matrix:
1115            if len(self.matrix[taxon]) != self.nchar:
1116                raise NexusError(
1117                    "Matrix Nchar %d does not match data length (%d) for taxon %s"
1118                    % (self.nchar, len(self.matrix[taxon]), taxon)
1119                )
1120        # check that taxlabels is identical with matrix.keys. If not, it's a problem
1121        matrixkeys = sorted(self.matrix)
1122        taxlabelssort = sorted(self.taxlabels[:])
1123        if matrixkeys != taxlabelssort:
1124            raise ValueError(
1125                "ERROR: TAXLABELS must be identical with MATRIX. "
1126                "Please Report this as a bug, and send in data file."
1127            )
1128
1129    def _translate(self, options):
1130        """Translate a Nexus file (PRIVATE)."""
1131        self.translate = {}
1132        opts = CharBuffer(options)
1133        while True:
1134            try:
1135                # get id and state
1136                identifier = int(opts.next_word())
1137                label = quotestrip(opts.next_word())
1138                self.translate[identifier] = label
1139                # check for comma or end of command
1140                c = opts.next_nonwhitespace()
1141                if c is None:
1142                    break
1143                elif c != ",":
1144                    raise NexusError("Missing ',' in line %s." % options)
1145            except NexusError:
1146                raise
1147            except Exception:  # TODO: ValueError?
1148                raise NexusError("Format error in line %s." % options) from None
1149
1150    def _utree(self, options):
1151        """Use 'utree' to denote an unrooted tree (ex: clustalx) (PRIVATE)."""
1152        self._tree(options)
1153
1154    def _tree(self, options):
1155        opts = CharBuffer(options)
1156        if opts.peek_nonwhitespace() == "*":
1157            # a star can be used to make it the default tree in some software packages
1158            dummy = opts.next_nonwhitespace()
1159        name = opts.next_word()
1160        if opts.next_nonwhitespace() != "=":
1161            raise NexusError("Syntax error in tree description: %s" % options[:50])
1162        rooted = False
1163        weight = 1.0
1164        while opts.peek_nonwhitespace() == "[":
1165            opts.next_nonwhitespace()  # discard opening bracket
1166            symbol = next(opts)
1167            if symbol != "&":
1168                raise NexusError(
1169                    "Illegal special comment [%s...] in tree description: %s"
1170                    % (symbol, options[:50])
1171                )
1172            special = next(opts)
1173            value = opts.next_until("]")
1174            next(opts)  # discard closing bracket
1175            if special == "R":
1176                rooted = True
1177            elif special == "U":
1178                rooted = False
1179            elif special == "W":
1180                weight = float(value)
1181        tree = Tree(name=name, weight=weight, rooted=rooted, tree=opts.rest().strip())
1182        # if there's an active translation table, translate
1183        if self.translate:
1184            for n in tree.get_terminals():
1185                try:
1186                    tree.node(n).data.taxon = safename(
1187                        self.translate[int(tree.node(n).data.taxon)]
1188                    )
1189                except (ValueError, KeyError):
1190                    raise NexusError(
1191                        "Unable to substitute %s using 'translate' data."
1192                        % tree.node(n).data.taxon
1193                    ) from None
1194        self.trees.append(tree)
1195
1196    def _apply_block_structure(self, title, lines):
1197        """Apply Block structure to the NEXUS file (PRIVATE)."""
1198        block = Block("")
1199        block.title = title
1200        for line in lines:
1201            block.commandlines.append(Commandline(line, title))
1202        self.structured.append(block)
1203
1204    def _taxset(self, options):
1205        """Create unique taxset (PRIVATE)."""
1206        name, taxa = self._get_indices(options, set_type=TAXSET)
1207        self.taxsets[name] = _make_unique(taxa)
1208
1209    def _charset(self, options):
1210        """Create unique character set (PRIVATE)."""
1211        name, sites = self._get_indices(options, set_type=CHARSET)
1212        self.charsets[name] = _make_unique(sites)
1213
1214    def _taxpartition(self, options):
1215        """Collect taxpartition from a NEXUS file (PRIVATE)."""
1216        taxpartition = {}
1217        quotelevel = False
1218        opts = CharBuffer(options)
1219        name = self._name_n_vector(opts)
1220        if not name:
1221            raise NexusError("Formatting error in taxpartition: %s " % options)
1222        # now collect thesubbpartitions and parse them
1223        # subpartitons separated by commas - which unfortunately could be part of a quoted identifier...
1224        # this is rather unelegant, but we have to avoid double-parsing and potential change of special nexus-words
1225        sub = ""
1226        while True:
1227            w = next(opts)
1228            if w is None or (w == "," and not quotelevel):
1229                subname, subindices = self._get_indices(
1230                    sub, set_type=TAXSET, separator=":"
1231                )
1232                taxpartition[subname] = _make_unique(subindices)
1233                sub = ""
1234                if w is None:
1235                    break
1236            else:
1237                if w == "'":
1238                    quotelevel = not quotelevel
1239                sub += w
1240        self.taxpartitions[name] = taxpartition
1241
1242    def _codonposset(self, options):
1243        """Read codon positions from a codons block as written from McClade (PRIVATE).
1244
1245        Here codonposset is just a fancy name for a character partition with
1246        the name CodonPositions and the partitions N,1,2,3
1247        """
1248        prev_partitions = list(self.charpartitions.keys())
1249        self._charpartition(options)
1250        # mcclade calls it CodonPositions, but you never know...
1251        codonname = [n for n in self.charpartitions if n not in prev_partitions]
1252        if codonname == [] or len(codonname) > 1:
1253            raise NexusError("Formatting Error in codonposset: %s " % options)
1254        else:
1255            self.codonposset = codonname[0]
1256
1257    def _codeset(self, options):
1258        pass
1259
1260    def _charpartition(self, options):
1261        """Collect character partition from NEXUS file (PRIVATE)."""
1262        charpartition = {}
1263        quotelevel = False
1264        opts = CharBuffer(options)
1265        name = self._name_n_vector(opts)
1266        if not name:
1267            raise NexusError("Formatting error in charpartition: %s " % options)
1268        # now collect the subpartitions and parse them
1269        # subpartitions separated by commas - which unfortunately could be part
1270        # of a quoted identifier...
1271        sub = ""
1272        while True:
1273            w = next(opts)
1274            if w is None or (w == "," and not quotelevel):
1275                subname, subindices = self._get_indices(
1276                    sub, set_type=CHARSET, separator=":"
1277                )
1278                charpartition[subname] = _make_unique(subindices)
1279                sub = ""
1280                if w is None:
1281                    break
1282            else:
1283                if w == "'":
1284                    quotelevel = not quotelevel
1285                sub += w
1286        self.charpartitions[name] = charpartition
1287
1288    def _get_indices(self, options, set_type=CHARSET, separator="="):
1289        r"""Parse the taxset/charset specification (PRIVATE).
1290
1291        e.g. '1 2   3 - 5 dog cat   10 - 20 \\ 3'
1292        --> [0,1,2,3,4,'dog','cat',9,12,15,18]
1293        """
1294        opts = CharBuffer(options)
1295        name = self._name_n_vector(opts, separator=separator)
1296        indices = self._parse_list(opts, set_type=set_type)
1297        if indices is None:
1298            raise NexusError("Formatting error in line: %s " % options)
1299        return name, indices
1300
1301    def _name_n_vector(self, opts, separator="="):
1302        """Extract name and check that it's not in vector format (PRIVATE)."""
1303        rest = opts.rest()
1304        name = opts.next_word()
1305        # we ignore * before names
1306        if name == "*":
1307            name = opts.next_word()
1308        if not name:
1309            raise NexusError("Formatting error in line: %s " % rest)
1310        name = quotestrip(name)
1311        if opts.peek_nonwhitespace == "(":
1312            open = opts.next_nonwhitespace()
1313            qualifier = open.next_word()
1314            close = opts.next_nonwhitespace()
1315            if qualifier.lower() == "vector":
1316                raise NexusError("Unsupported VECTOR format in line %s" % (opts))
1317            elif qualifier.lower() != "standard":
1318                raise NexusError("Unknown qualifier %s in line %s" % (qualifier, opts))
1319        if opts.next_nonwhitespace() != separator:
1320            raise NexusError("Formatting error in line: %s " % rest)
1321        return name
1322
1323    def _parse_list(self, options_buffer, set_type):
1324        r"""Parse a NEXUS list (PRIVATE).
1325
1326        e.g. [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17,21],
1327        (assuming dog is taxon no. 17 and cat is taxon no. 21).
1328        """
1329        plain_list = []
1330        if options_buffer.peek_nonwhitespace():
1331            try:
1332                # capture all possible exceptions and treat them as formatting
1333                # errors, if they are not NexusError
1334                while True:
1335                    identifier = options_buffer.next_word()  # next list element
1336                    if not identifier:  # end of list?
1337                        break
1338                    start = self._resolve(identifier, set_type=set_type)
1339                    if options_buffer.peek_nonwhitespace() == "-":  # followd by -
1340                        end = start
1341                        step = 1
1342                        # get hyphen and end of range
1343                        hyphen = options_buffer.next_nonwhitespace()
1344                        end = self._resolve(
1345                            options_buffer.next_word(), set_type=set_type
1346                        )
1347                        if set_type == CHARSET:
1348                            if (
1349                                options_buffer.peek_nonwhitespace() == "\\"
1350                            ):  # followd by \
1351                                backslash = options_buffer.next_nonwhitespace()
1352                                step = int(
1353                                    options_buffer.next_word()
1354                                )  # get backslash and step
1355                            plain_list.extend(range(start, end + 1, step))
1356                        else:
1357                            if isinstance(start, list) or isinstance(end, list):
1358                                raise NexusError(
1359                                    "Name if character sets not allowed in range definition: %s"
1360                                    % identifier
1361                                )
1362                            start = self.taxlabels.index(start)
1363                            end = self.taxlabels.index(end)
1364                            taxrange = self.taxlabels[start : end + 1]
1365                            plain_list.extend(taxrange)
1366                    else:
1367                        if isinstance(start, list):
1368                            # start was the name of charset or taxset
1369                            plain_list.extend(start)
1370                        else:
1371                            # start was an ordinary identifier
1372                            plain_list.append(start)
1373            except NexusError:
1374                raise
1375            except Exception:  # FIXME - this seems unwise
1376                return None
1377        return plain_list
1378
1379    def _resolve(self, identifier, set_type=None):
1380        """Translate identifier in list into character/taxon index (PRIVATE).
1381
1382        Characters (which are referred to by their index in Nexus.py):
1383            Plain numbers are returned minus 1 (Nexus indices to python indices)
1384            Text identifiers are translated into their indices (if plain character identifiers),
1385            the first hit in charlabels is returned (charlabels don't need to be unique)
1386            or the range of indices is returned (if names of character sets).
1387        Taxa (which are referred to by their unique name in Nexus.py):
1388            Plain numbers are translated in their taxon name, underscores and spaces are considered equal.
1389            Names are returned unchanged (if plain taxon identifiers), or the names in
1390            the corresponding taxon set is returned.
1391
1392        """
1393        identifier = quotestrip(identifier)
1394        if not set_type:
1395            raise NexusError("INTERNAL ERROR: Need type to resolve identifier.")
1396        if set_type == CHARSET:
1397            try:
1398                n = int(identifier)
1399            except ValueError:
1400                if self.charlabels and identifier in self.charlabels.values():
1401                    for k in self.charlabels:
1402                        if self.charlabels[k] == identifier:
1403                            return k
1404                elif self.charsets and identifier in self.charsets:
1405                    return self.charsets[identifier]
1406                else:
1407                    raise NexusError(
1408                        "Unknown character identifier: %s" % identifier
1409                    ) from None
1410            else:
1411                if n <= self.nchar:
1412                    return n - 1
1413                else:
1414                    raise NexusError(
1415                        "Illegal character identifier: %d>nchar (=%d)."
1416                        % (identifier, self.nchar)
1417                    )
1418        elif set_type == TAXSET:
1419            try:
1420                n = int(identifier)
1421            except ValueError:
1422                taxlabels_id = self._check_taxlabels(identifier)
1423                if taxlabels_id:
1424                    return taxlabels_id
1425                elif self.taxsets and identifier in self.taxsets:
1426                    return self.taxsets[identifier]
1427                else:
1428                    raise NexusError(
1429                        "Unknown taxon identifier: %s" % identifier
1430                    ) from None
1431            else:
1432                if n > 0 and n <= self.ntax:
1433                    return self.taxlabels[n - 1]
1434                else:
1435                    raise NexusError(
1436                        "Illegal taxon identifier: %d>ntax (=%d)."
1437                        % (identifier, self.ntax)
1438                    )
1439        else:
1440            raise NexusError("Unknown set specification: %s." % set_type)
1441
1442    def _stateset(self, options):
1443        # Not implemented
1444        pass
1445
1446    def _changeset(self, options):
1447        # Not implemented
1448        pass
1449
1450    def _treeset(self, options):
1451        # Not implemented
1452        pass
1453
1454    def _treepartition(self, options):
1455        # Not implemented
1456        pass
1457
1458    def write_nexus_data_partitions(
1459        self,
1460        matrix=None,
1461        filename=None,
1462        blocksize=None,
1463        interleave=False,
1464        exclude=(),
1465        delete=(),
1466        charpartition=None,
1467        comment="",
1468        mrbayes=False,
1469    ):
1470        """Write a nexus file for each partition in charpartition.
1471
1472        Only non-excluded characters and non-deleted taxa are included,
1473        just the data block is written.
1474        """
1475        if not matrix:
1476            matrix = self.matrix
1477        if not matrix:
1478            return
1479        if not filename:
1480            filename = self.filename
1481        if charpartition:
1482            pfilenames = {}
1483            for p in charpartition:
1484                total_exclude = list(exclude)
1485                total_exclude.extend(
1486                    c for c in range(self.nchar) if c not in charpartition[p]
1487                )
1488                total_exclude = _make_unique(total_exclude)
1489                pcomment = comment + "\nPartition: " + p + "\n"
1490                dot = filename.rfind(".")
1491                if dot > 0:
1492                    pfilename = filename[:dot] + "_" + p + ".data"
1493                else:
1494                    pfilename = filename + "_" + p
1495                pfilenames[p] = pfilename
1496                self.write_nexus_data(
1497                    filename=pfilename,
1498                    matrix=matrix,
1499                    blocksize=blocksize,
1500                    interleave=interleave,
1501                    exclude=total_exclude,
1502                    delete=delete,
1503                    comment=pcomment,
1504                    append_sets=False,
1505                    mrbayes=mrbayes,
1506                )
1507            return pfilenames
1508        else:
1509            fn = self.filename + ".data"
1510            self.write_nexus_data(
1511                filename=fn,
1512                matrix=matrix,
1513                blocksize=blocksize,
1514                interleave=interleave,
1515                exclude=exclude,
1516                delete=delete,
1517                comment=comment,
1518                append_sets=False,
1519                mrbayes=mrbayes,
1520            )
1521            return fn
1522
1523    def write_nexus_data(
1524        self,
1525        filename=None,
1526        matrix=None,
1527        exclude=(),
1528        delete=(),
1529        blocksize=None,
1530        interleave=False,
1531        interleave_by_partition=False,
1532        comment=None,
1533        omit_NEXUS=False,
1534        append_sets=True,
1535        mrbayes=False,
1536        codons_block=True,
1537    ):
1538        """Write a nexus file with data and sets block to a file or handle.
1539
1540        Character sets and partitions are appended by default, and are
1541        adjusted according to excluded characters (i.e. character sets
1542        still point to the same sites (not necessarily same positions),
1543        without including the deleted characters.
1544
1545        - filename - Either a filename as a string (which will be opened,
1546          written to and closed), or a handle object (which will
1547          be written to but NOT closed).
1548        - interleave_by_partition - Optional name of partition (string)
1549        - omit_NEXUS - Boolean.  If true, the '#NEXUS' line normally at the
1550          start of the file is omitted.
1551
1552        Returns the filename/handle used to write the data.
1553        """
1554        if not matrix:
1555            matrix = self.matrix
1556        if not matrix:
1557            return
1558        if not filename:
1559            filename = self.filename
1560        if [t for t in delete if not self._check_taxlabels(t)]:
1561            raise NexusError(
1562                "Unknown taxa: %s"
1563                % ", ".join(set(delete).difference(set(self.taxlabels)))
1564            )
1565        if interleave_by_partition:
1566            if interleave_by_partition not in self.charpartitions:
1567                raise NexusError("Unknown partition: %r" % interleave_by_partition)
1568            else:
1569                partition = self.charpartitions[interleave_by_partition]
1570                # we need to sort the partition names by starting position
1571                # before we exclude characters
1572                names = _sort_keys_by_values(partition)
1573                newpartition = {}
1574                for p in partition:
1575                    newpartition[p] = [c for c in partition[p] if c not in exclude]
1576        # how many taxa and how many characters are left?
1577        undelete = [
1578            taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete
1579        ]
1580        cropped_matrix = _seqmatrix2strmatrix(
1581            self.crop_matrix(matrix, exclude=exclude, delete=delete)
1582        )
1583        ntax_adjusted = len(undelete)
1584        nchar_adjusted = len(cropped_matrix[undelete[0]])
1585        if not undelete or (undelete and undelete[0] == ""):
1586            return
1587
1588        with File.as_handle(filename, mode="w") as fh:
1589            if not omit_NEXUS:
1590                fh.write("#NEXUS\n")
1591            if comment:
1592                fh.write("[" + comment + "]\n")
1593            fh.write("begin data;\n")
1594            fh.write("dimensions ntax=%d nchar=%d;\n" % (ntax_adjusted, nchar_adjusted))
1595            fh.write("format datatype=" + self.datatype)
1596            if self.respectcase:
1597                fh.write(" respectcase")
1598            if self.missing:
1599                fh.write(" missing=" + self.missing)
1600            if self.gap:
1601                fh.write(" gap=" + self.gap)
1602            if self.matchchar:
1603                fh.write(" matchchar=" + self.matchchar)
1604            if self.labels:
1605                fh.write(" labels=" + self.labels)
1606            if self.equate:
1607                fh.write(" equate=" + self.equate)
1608            if interleave or interleave_by_partition:
1609                fh.write(" interleave")
1610            fh.write(";\n")
1611            # if self.taxlabels:
1612            #    fh.write('taxlabels '+' '.join(self.taxlabels)+';\n')
1613            if self.charlabels:
1614                newcharlabels = self._adjust_charlabels(exclude=exclude)
1615                clkeys = sorted(newcharlabels)
1616                fh.write(
1617                    "charlabels "
1618                    + ", ".join(
1619                        "%s %s" % (k + 1, safename(newcharlabels[k])) for k in clkeys
1620                    )
1621                    + ";\n"
1622                )
1623            fh.write("matrix\n")
1624            if not blocksize:
1625                if interleave:
1626                    blocksize = 70
1627                else:
1628                    blocksize = self.nchar
1629            # delete deleted taxa and ecxclude excluded characters...
1630            namelength = max(len(safename(t, mrbayes=mrbayes)) for t in undelete)
1631            if interleave_by_partition:
1632                # interleave by partitions, but adjust partitions with regard
1633                # to excluded characters
1634                seek = 0
1635                for p in names:
1636                    fh.write("[%s: %s]\n" % (interleave_by_partition, p))
1637                    if len(newpartition[p]) > 0:
1638                        for taxon in undelete:
1639                            fh.write(
1640                                safename(taxon, mrbayes=mrbayes).ljust(namelength + 1)
1641                            )
1642                            fh.write(
1643                                cropped_matrix[taxon][
1644                                    seek : seek + len(newpartition[p])
1645                                ]
1646                                + "\n"
1647                            )
1648                        fh.write("\n")
1649                    else:
1650                        fh.write("[empty]\n\n")
1651                    seek += len(newpartition[p])
1652            elif interleave:
1653                for seek in range(0, nchar_adjusted, blocksize):
1654                    for taxon in undelete:
1655                        fh.write(safename(taxon, mrbayes=mrbayes).ljust(namelength + 1))
1656                        fh.write(cropped_matrix[taxon][seek : seek + blocksize] + "\n")
1657                    fh.write("\n")
1658            else:
1659                for taxon in undelete:
1660                    if blocksize < nchar_adjusted:
1661                        fh.write(safename(taxon, mrbayes=mrbayes) + "\n")
1662                    else:
1663                        fh.write(safename(taxon, mrbayes=mrbayes).ljust(namelength + 1))
1664                    taxon_seq = cropped_matrix[taxon]
1665                    for seek in range(0, nchar_adjusted, blocksize):
1666                        fh.write(taxon_seq[seek : seek + blocksize] + "\n")
1667                    del taxon_seq
1668            fh.write(";\nend;\n")
1669            if append_sets:
1670                if codons_block:
1671                    fh.write(
1672                        self.append_sets(
1673                            exclude=exclude,
1674                            delete=delete,
1675                            mrbayes=mrbayes,
1676                            include_codons=False,
1677                        )
1678                    )
1679                    fh.write(
1680                        self.append_sets(
1681                            exclude=exclude,
1682                            delete=delete,
1683                            mrbayes=mrbayes,
1684                            codons_only=True,
1685                        )
1686                    )
1687                else:
1688                    fh.write(
1689                        self.append_sets(
1690                            exclude=exclude, delete=delete, mrbayes=mrbayes
1691                        )
1692                    )
1693        return filename
1694
1695    def append_sets(
1696        self,
1697        exclude=(),
1698        delete=(),
1699        mrbayes=False,
1700        include_codons=True,
1701        codons_only=False,
1702    ):
1703        """Return a sets block."""
1704        if not self.charsets and not self.taxsets and not self.charpartitions:
1705            return ""
1706        if codons_only:
1707            setsb = ["\nbegin codons"]
1708        else:
1709            setsb = ["\nbegin sets"]
1710        # - now if characters have been excluded, the character sets need to be adjusted,
1711        #   so that they still point to the right character positions
1712        # calculate a list of offsets: for each deleted character, the following character position
1713        # in the new file will have an additional offset of -1
1714        offset = 0
1715        offlist = []
1716        for c in range(self.nchar):
1717            if c in exclude:
1718                offset += 1
1719                offlist.append(
1720                    -1
1721                )  # dummy value as these character positions are excluded
1722            else:
1723                offlist.append(c - offset)
1724        # now adjust each of the character sets
1725        if not codons_only:
1726            for n, ns in self.charsets.items():
1727                cset = [offlist[c] for c in ns if c not in exclude]
1728                if cset:
1729                    setsb.append(
1730                        "charset %s = %s" % (safename(n), _compact4nexus(cset))
1731                    )
1732            for n, s in self.taxsets.items():
1733                tset = [safename(t, mrbayes=mrbayes) for t in s if t not in delete]
1734                if tset:
1735                    setsb.append("taxset %s = %s" % (safename(n), " ".join(tset)))
1736        for n, p in self.charpartitions.items():
1737            if not include_codons and n == CODONPOSITIONS:
1738                continue
1739            elif codons_only and n != CODONPOSITIONS:
1740                continue
1741            # as characters have been excluded, the partitions must be adjusted
1742            # if a partition is empty, it will be omitted from the charpartition command
1743            # (although paup allows charpartition part=t1:,t2:,t3:1-100)
1744            names = _sort_keys_by_values(p)
1745            newpartition = {}
1746            for sn in names:
1747                nsp = [offlist[c] for c in p[sn] if c not in exclude]
1748                if nsp:
1749                    newpartition[sn] = nsp
1750            if newpartition:
1751                if include_codons and n == CODONPOSITIONS:
1752                    command = "codonposset"
1753                else:
1754                    command = "charpartition"
1755                setsb.append(
1756                    "%s %s = %s"
1757                    % (
1758                        command,
1759                        safename(n),
1760                        ", ".join(
1761                            "%s: %s" % (sn, _compact4nexus(newpartition[sn]))
1762                            for sn in names
1763                            if sn in newpartition
1764                        ),
1765                    )
1766                )
1767        # now write charpartititions, much easier than charpartitions
1768        for n, p in self.taxpartitions.items():
1769            names = _sort_keys_by_values(p)
1770            newpartition = {}
1771            for sn in names:
1772                nsp = [t for t in p[sn] if t not in delete]
1773                if nsp:
1774                    newpartition[sn] = nsp
1775            if newpartition:
1776                setsb.append(
1777                    "taxpartition %s = %s"
1778                    % (
1779                        safename(n),
1780                        ", ".join(
1781                            "%s: %s"
1782                            % (
1783                                safename(sn),
1784                                " ".join(safename(x) for x in newpartition[sn]),
1785                            )
1786                            for sn in names
1787                            if sn in newpartition
1788                        ),
1789                    )
1790                )
1791        # add 'end' and return everything
1792        setsb.append("end;\n")
1793        if len(setsb) == 2:  # begin and end only
1794            return ""
1795        else:
1796            return ";\n".join(setsb)
1797
1798    def export_fasta(self, filename=None, width=70):
1799        """Write matrix into a fasta file."""
1800        if not filename:
1801            if "." in self.filename and self.filename.split(".")[-1].lower() in [
1802                "paup",
1803                "nexus",
1804                "nex",
1805                "dat",
1806            ]:
1807                filename = ".".join(self.filename.split(".")[:-1]) + ".fas"
1808            else:
1809                filename = self.filename + ".fas"
1810        with open(filename, "w") as fh:
1811            for taxon in self.taxlabels:
1812                fh.write(">" + safename(taxon) + "\n")
1813                for i in range(0, len(str(self.matrix[taxon])), width):
1814                    fh.write(str(self.matrix[taxon])[i : i + width] + "\n")
1815        return filename
1816
1817    def export_phylip(self, filename=None):
1818        """Write matrix into a PHYLIP file.
1819
1820        Note that this writes a relaxed PHYLIP format file, where the names
1821        are not truncated, nor checked for invalid characters.
1822        """
1823        if not filename:
1824            if "." in self.filename and self.filename.split(".")[-1].lower() in [
1825                "paup",
1826                "nexus",
1827                "nex",
1828                "dat",
1829            ]:
1830                filename = ".".join(self.filename.split(".")[:-1]) + ".phy"
1831            else:
1832                filename = self.filename + ".phy"
1833        with open(filename, "w") as fh:
1834            fh.write("%d %d\n" % (self.ntax, self.nchar))
1835            for taxon in self.taxlabels:
1836                fh.write("%s %s\n" % (safename(taxon), str(self.matrix[taxon])))
1837        return filename
1838
1839    def constant(self, matrix=None, delete=(), exclude=()):
1840        """Return a list with all constant characters."""
1841        if not matrix:
1842            matrix = self.matrix
1843        undelete = [t for t in self.taxlabels if t in matrix and t not in delete]
1844        if not undelete:
1845            return None
1846        elif len(undelete) == 1:
1847            return [x for x in range(len(matrix[undelete[0]])) if x not in exclude]
1848        # get the first sequence and expand all ambiguous values
1849        constant = [
1850            (x, self.ambiguous_values.get(n.upper(), n.upper()))
1851            for x, n in enumerate(str(matrix[undelete[0]]))
1852            if x not in exclude
1853        ]
1854
1855        for taxon in undelete[1:]:
1856            newconstant = []
1857            for site in constant:
1858                # print("%d (paup=%d)" % (site[0],site[0]+1), end="")
1859                seqsite = matrix[taxon][site[0]].upper()
1860                # print(seqsite,"checked against",site[1],"\t", end="")
1861                if (
1862                    seqsite == self.missing
1863                    or (
1864                        seqsite == self.gap
1865                        and self.options["gapmode"].lower() == "missing"
1866                    )
1867                    or seqsite == site[1]
1868                ):
1869                    # missing or same as before  -> ok
1870                    newconstant.append(site)
1871                elif (
1872                    seqsite in site[1]
1873                    or site[1] == self.missing
1874                    or (
1875                        self.options["gapmode"].lower() == "missing"
1876                        and site[1] == self.gap
1877                    )
1878                ):
1879                    # subset of an ambig or only missing in previous -> take subset
1880                    newconstant.append(
1881                        (site[0], self.ambiguous_values.get(seqsite, seqsite))
1882                    )
1883                elif seqsite in self.ambiguous_values:
1884                    # is it an ambig: check the intersection with prev. values
1885                    intersect = set(self.ambiguous_values[seqsite]).intersection(
1886                        set(site[1])
1887                    )
1888                    if intersect:
1889                        newconstant.append((site[0], "".join(intersect)))
1890                    #    print("ok")
1891                    # else:
1892                    #    print("failed")
1893                # else:
1894                #    print("failed")
1895            constant = newconstant
1896        cpos = [s[0] for s in constant]
1897        return cpos
1898
1899    def cstatus(self, site, delete=(), narrow=True):
1900        """Summarize character.
1901
1902        narrow=True:  paup-mode (a c ? --> ac; ? ? ? --> ?)
1903        narrow=false:           (a c ? --> a c g t -; ? ? ? --> a c g t -)
1904        """
1905        undelete = [t for t in self.taxlabels if t not in delete]
1906        if not undelete:
1907            return None
1908        cstatus = []
1909        for t in undelete:
1910            c = self.matrix[t][site].upper()
1911            if self.options.get("gapmode") == "missing" and c == self.gap:
1912                c = self.missing
1913            if narrow and c == self.missing:
1914                if c not in cstatus:
1915                    cstatus.append(c)
1916            else:
1917                cstatus.extend(b for b in self.ambiguous_values[c] if b not in cstatus)
1918        if self.missing in cstatus and narrow and len(cstatus) > 1:
1919            cstatus = [_ for _ in cstatus if _ != self.missing]
1920        cstatus.sort()
1921        return cstatus
1922
1923    def weighted_stepmatrix(self, name="your_name_here", exclude=(), delete=()):
1924        """Calculate a stepmatrix for weighted parsimony.
1925
1926        See Wheeler (1990), Cladistics 6:269-275 and
1927        Felsenstein (1981), Biol. J. Linn. Soc. 16:183-196
1928        """
1929        m = StepMatrix(self.unambiguous_letters, self.gap)
1930        for site in [s for s in range(self.nchar) if s not in exclude]:
1931            cstatus = self.cstatus(site, delete)
1932            for i, b1 in enumerate(cstatus[:-1]):
1933                for b2 in cstatus[i + 1 :]:
1934                    m.add(b1.upper(), b2.upper(), 1)
1935        return m.transformation().weighting().smprint(name=name)
1936
1937    def crop_matrix(self, matrix=None, delete=(), exclude=()):
1938        """Return a matrix without deleted taxa and excluded characters."""
1939        if not matrix:
1940            matrix = self.matrix
1941        if [t for t in delete if not self._check_taxlabels(t)]:
1942            raise NexusError(
1943                "Unknown taxa: %s" % ", ".join(set(delete).difference(self.taxlabels))
1944            )
1945        if exclude != []:
1946            undelete = [t for t in self.taxlabels if t in matrix and t not in delete]
1947            if not undelete:
1948                return {}
1949            m = [str(matrix[k]) for k in undelete]
1950            sitesm = [s for i, s in enumerate(zip(*m)) if i not in exclude]
1951            if sitesm == []:
1952                return {t: Seq("") for t in undelete}
1953            else:
1954                m = [Seq(s) for s in ("".join(x) for x in zip(*sitesm))]
1955                return dict(zip(undelete, m))
1956        else:
1957            return {
1958                t: matrix[t] for t in self.taxlabels if t in matrix and t not in delete
1959            }
1960
1961    def bootstrap(self, matrix=None, delete=(), exclude=()):
1962        """Return a bootstrapped matrix."""
1963        if not matrix:
1964            matrix = self.matrix
1965        seqobjects = isinstance(
1966            matrix[list(matrix.keys())[0]], Seq
1967        )  # remember if Seq objects
1968        cm = self.crop_matrix(delete=delete, exclude=exclude)  # crop data out
1969        if not cm:  # everything deleted?
1970            return {}
1971        elif not cm[list(cm.keys())[0]]:  # everything excluded?
1972            return cm
1973        undelete = [t for t in self.taxlabels if t in cm]
1974        if seqobjects:
1975            sitesm = list(zip(*[str(cm[t]) for t in undelete]))
1976        else:
1977            sitesm = list(zip(*[cm[t] for t in undelete]))
1978        bootstrapsitesm = [
1979            sitesm[random.randint(0, len(sitesm) - 1)] for _ in range(len(sitesm))
1980        ]
1981        bootstrapseqs = ["".join(x) for x in zip(*bootstrapsitesm)]
1982        if seqobjects:
1983            bootstrapseqs = [Seq(s) for s in bootstrapseqs]
1984        return dict(zip(undelete, bootstrapseqs))
1985
1986    def add_sequence(self, name, sequence):
1987        """Add a sequence (string) to the matrix."""
1988        if not name:
1989            raise NexusError("New sequence must have a name")
1990
1991        diff = self.nchar - len(sequence)
1992        if diff < 0:
1993            self.insert_gap(self.nchar, -diff)
1994        elif diff > 0:
1995            sequence += self.missing * diff
1996
1997        if name in self.taxlabels:
1998            unique_name = _unique_label(self.taxlabels, name)
1999            # print("WARNING: Sequence name %s is already present. Sequence was added as %s." % (name,unique_name))
2000        else:
2001            unique_name = name
2002
2003        if unique_name in self.matrix:
2004            raise ValueError(
2005                "ERROR. There is a discrepancy between taxlabels "
2006                "and matrix keys. Report this as a bug."
2007            )
2008
2009        self.matrix[unique_name] = Seq(sequence)
2010        self.ntax += 1
2011        self.taxlabels.append(unique_name)
2012        self.unaltered_taxlabels.append(name)
2013
2014    def insert_gap(self, pos, n=1, leftgreedy=False):
2015        """Add a gap into the matrix and adjust charsets and partitions.
2016
2017        pos=0: first position
2018        pos=nchar: last position
2019        """
2020
2021        def _adjust(set, x, d, leftgreedy=False):
2022            """Adjust character sets if gaps are inserted (PRIVATE).
2023
2024            Takes care of new gaps within a coherent character set.
2025            """
2026            # if 3 gaps are inserted at pos. 9 in a set that looks like 1 2 3  8 9 10 11 13 14 15
2027            # then the adjusted set will be 1 2 3  8 9 10 11 12 13 14 15 16 17 18
2028            # but inserting into position 8 it will stay like 1 2 3 11 12 13 14 15 16 17 18
2029            set.sort()
2030            addpos = 0
2031            for i, c in enumerate(set):
2032                if c >= x:
2033                    set[i] = c + d
2034                # if we add gaps within a group of characters, we want the gap position included in this group
2035                if c == x:
2036                    if leftgreedy or (i > 0 and set[i - 1] == c - 1):
2037                        addpos = i
2038            if addpos > 0:
2039                set[addpos:addpos] = list(range(x, x + d))
2040            return set
2041
2042        if pos < 0 or pos > self.nchar:
2043            raise NexusError("Illegal gap position: %d" % pos)
2044        if n == 0:
2045            return
2046        sitesm = list(zip(*[str(self.matrix[t]) for t in self.taxlabels]))
2047        sitesm[pos:pos] = [["-"] * len(self.taxlabels)] * n
2048        mapped = ["".join(x) for x in zip(*sitesm)]
2049        listed = [(taxon, Seq(mapped[i])) for i, taxon in enumerate(self.taxlabels)]
2050        self.matrix = dict(listed)
2051        self.nchar += n
2052        # now adjust character sets
2053        for i, s in self.charsets.items():
2054            self.charsets[i] = _adjust(s, pos, n, leftgreedy=leftgreedy)
2055        for p in self.charpartitions:
2056            for sp, s in self.charpartitions[p].items():
2057                self.charpartitions[p][sp] = _adjust(s, pos, n, leftgreedy=leftgreedy)
2058        # now adjust character state labels
2059        self.charlabels = self._adjust_charlabels(insert=[pos] * n)
2060        return self.charlabels
2061
2062    def _adjust_charlabels(self, exclude=None, insert=None):
2063        """Return adjusted indices of self.charlabels if characters are excluded or inserted (PRIVATE)."""
2064        if exclude and insert:
2065            raise NexusError("Can't exclude and insert at the same time")
2066        if not self.charlabels:
2067            return None
2068        labels = sorted(self.charlabels)
2069        newcharlabels = {}
2070        if exclude:
2071            exclude.sort()
2072            exclude.append(sys.maxsize)
2073            excount = 0
2074            for c in labels:
2075                if c not in exclude:
2076                    while c > exclude[excount]:
2077                        excount += 1
2078                    newcharlabels[c - excount] = self.charlabels[c]
2079        elif insert:
2080            insert.sort()
2081            insert.append(sys.maxsize)
2082            icount = 0
2083            for c in labels:
2084                while c >= insert[icount]:
2085                    icount += 1
2086                newcharlabels[c + icount] = self.charlabels[c]
2087        else:
2088            return self.charlabels
2089        return newcharlabels
2090
2091    def invert(self, charlist):
2092        """Return all character indices that are not in charlist."""
2093        return [c for c in range(self.nchar) if c not in charlist]
2094
2095    def gaponly(self, include_missing=False):
2096        """Return gap-only sites."""
2097        gap = set(self.gap)
2098        if include_missing:
2099            gap.add(self.missing)
2100        sitesm = zip(*[str(self.matrix[t]) for t in self.taxlabels])
2101        return [i for i, site in enumerate(sitesm) if set(site).issubset(gap)]
2102
2103    def terminal_gap_to_missing(self, missing=None, skip_n=True):
2104        """Replace all terminal gaps with missing character.
2105
2106        Mixtures like ???------??------- are properly resolved.
2107        """
2108        if not missing:
2109            missing = self.missing
2110        replace = [self.missing, self.gap]
2111        if not skip_n:
2112            replace.extend(["n", "N"])
2113        for taxon in self.taxlabels:
2114            sequence = str(self.matrix[taxon])
2115            length = len(sequence)
2116            start, end = get_start_end(sequence, skiplist=replace)
2117            if start == -1 and end == -1:
2118                sequence = missing * length
2119            else:
2120                sequence = sequence[: end + 1] + missing * (length - end - 1)
2121                sequence = start * missing + sequence[start:]
2122            if length != len(sequence):
2123                raise RuntimeError(
2124                    "Illegal sequence manipulation in "
2125                    "Nexus.terminal_gap_to_missing in taxon %s" % taxon
2126                )
2127            self.matrix[taxon] = Seq(sequence)
2128
2129
2130try:
2131    import cnexus
2132except ImportError:
2133
2134    def _get_command_lines(file_contents):
2135        lines = _kill_comments_and_break_lines(file_contents)
2136        commandlines = _adjust_lines(lines)
2137        return commandlines
2138
2139
2140else:
2141
2142    def _get_command_lines(file_contents):
2143        decommented = cnexus.scanfile(file_contents)
2144        # check for unmatched parentheses
2145        if decommented == "[" or decommented == "]":
2146            raise NexusError("Unmatched %s" % decommented)
2147        # cnexus can't return lists, so in analogy we separate
2148        # commandlines with chr(7) (a character that shouldn't be part of a
2149        # nexus file under normal circumstances)
2150        commandlines = _adjust_lines(decommented.split(chr(7)))
2151        return commandlines
2152
2153
2154if __name__ == "__main__":
2155    from Bio._utils import run_doctest
2156
2157    run_doctest()
2158