1# Copyright 2005-2008 by Frank Kauff & Cymon J. Cox. All rights reserved. 2# Revisions copyright 2014-2015 by Joe Cora (standard data) 3# 4# This file is part of the Biopython distribution and governed by your 5# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 6# Please see the LICENSE file that should have been included as part of this 7# package. 8 9"""Nexus class. Parse the contents of a NEXUS file. 10 11Based upon 'NEXUS: An extensible file format for systematic information' 12Maddison, Swofford, Maddison. 1997. Syst. Biol. 46(4):590-621 13""" 14 15from functools import reduce 16import copy 17import math 18import random 19import sys 20 21from Bio import File 22from Bio.Data import IUPACData 23from Bio.Seq import Seq 24 25from Bio.Nexus.StandardData import StandardData 26from Bio.Nexus.Trees import Tree 27 28 29INTERLEAVE = 70 30SPECIAL_COMMANDS = [ 31 "charstatelabels", 32 "charlabels", 33 "taxlabels", 34 "taxset", 35 "charset", 36 "charpartition", 37 "taxpartition", 38 "matrix", 39 "tree", 40 "utree", 41 "translate", 42 "codonposset", 43 "title", 44] 45KNOWN_NEXUS_BLOCKS = ["trees", "data", "characters", "taxa", "sets", "codons"] 46PUNCTUATION = "()[]{}\\,;:=*\\'\"`+-<>" 47MRBAYESSAFE = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_" 48WHITESPACE = " \t\n" 49# SPECIALCOMMENTS = ['!','&','%','/','\\','@'] # original list of special comments 50SPECIALCOMMENTS = [ 51 "&" 52] # supported special comment ('tree' command), all others are ignored 53CHARSET = "chars" 54TAXSET = "taxa" 55CODONPOSITIONS = "codonpositions" 56DEFAULTNEXUS = ( 57 "#NEXUS\nbegin data; dimensions ntax=0 nchar=0; format datatype=dna; end; " 58) 59 60 61class NexusError(Exception): 62 """Provision for the management of Nexus exceptions.""" 63 64 pass 65 66 67class CharBuffer: 68 """Helps reading NEXUS-words and characters from a buffer (semi-PRIVATE). 69 70 This class is not intended for public use (any more). 71 """ 72 73 def __init__(self, string): 74 """Initialize the class.""" 75 if string: 76 self.buffer = list(string) 77 else: 78 self.buffer = [] 79 80 def peek(self): 81 """Return the first character from the buffer.""" 82 if self.buffer: 83 return self.buffer[0] 84 else: 85 return None 86 87 def peek_nonwhitespace(self): 88 """Return the first character from the buffer, do not include spaces.""" 89 b = "".join(self.buffer).strip() 90 if b: 91 return b[0] 92 else: 93 return None 94 95 def __next__(self): 96 """Iterate over NEXUS characters in the file.""" 97 if self.buffer: 98 return self.buffer.pop(0) 99 else: 100 return None 101 102 def next_nonwhitespace(self): 103 """Check for next non whitespace character in NEXUS file.""" 104 while True: 105 p = next(self) 106 if p is None: 107 break 108 if p not in WHITESPACE: 109 return p 110 return None 111 112 def skip_whitespace(self): 113 """Skip whitespace characters in NEXUS file.""" 114 while self.buffer[0] in WHITESPACE: 115 self.buffer = self.buffer[1:] 116 117 def next_until(self, target): 118 """Iterate over the NEXUS file until a target character is reached.""" 119 for t in target: 120 try: 121 pos = self.buffer.index(t) 122 except ValueError: 123 pass 124 else: 125 found = "".join(self.buffer[:pos]) 126 self.buffer = self.buffer[pos:] 127 return found 128 else: 129 return None 130 131 def peek_word(self, word): 132 """Return a word stored in the buffer.""" 133 return "".join(self.buffer[: len(word)]) == word 134 135 def next_word(self): 136 """Return the next NEXUS word from a string. 137 138 This deals with single and double quotes, whitespace and punctuation. 139 """ 140 word = [] 141 quoted = False 142 # get first character 143 first = self.next_nonwhitespace() 144 if not first: 145 # return empty if only whitespace left 146 return None 147 word.append(first) 148 if first == "'": 149 quoted = "'" 150 elif first == '"': 151 quoted = '"' 152 elif first in PUNCTUATION: 153 # if it's non-quote punctuation, return immediately 154 return first 155 while True: 156 c = self.peek() 157 if c == quoted: # a quote? 158 word.append(next(self)) # store quote 159 if self.peek() == quoted: # double quote 160 next(self) # skip second quote 161 elif quoted: # second single quote ends word 162 break 163 elif quoted: 164 # if quoted, then add anything 165 word.append(next(self)) 166 elif not c or c in PUNCTUATION or c in WHITESPACE: 167 # if not quoted and special character, stop 168 break 169 else: 170 word.append(next(self)) # standard character 171 return "".join(word) 172 173 def rest(self): 174 """Return the rest of the string without parsing.""" 175 return "".join(self.buffer) 176 177 178class StepMatrix: 179 """Calculate a stepmatrix for weighted parsimony. 180 181 See : 182 COMBINATORIAL WEIGHTS IN PHYLOGENETIC ANALYSIS - A STATISTICAL PARSIMONY PROCEDURE 183 Wheeler (1990), Cladistics 6:269-275. 184 """ 185 186 def __init__(self, symbols, gap): 187 """Initialize the class.""" 188 self.data = {} 189 self.symbols = sorted(symbols) 190 if gap: 191 self.symbols.append(gap) 192 for x in self.symbols: 193 for y in [s for s in self.symbols if s != x]: 194 self.set(x, y, 0) 195 196 def set(self, x, y, value): 197 """Set a given value in the matrix's position.""" 198 if x > y: 199 x, y = y, x 200 self.data[x + y] = value 201 202 def add(self, x, y, value): 203 """Add the given value to existing, in matrix's position.""" 204 if x > y: 205 x, y = y, x 206 self.data[x + y] += value 207 208 def sum(self): 209 """Calculate the associations, makes matrix of associations.""" 210 return reduce(lambda x, y: x + y, self.data.values()) 211 212 def transformation(self): 213 """Calculate the transformation matrix. 214 215 Normalizes the columns of the matrix of associations. 216 """ 217 total = self.sum() 218 if total != 0: 219 for k in self.data: 220 self.data[k] = self.data[k] / float(total) 221 return self 222 223 def weighting(self): 224 """Calculate the Phylogenetic weight matrix. 225 226 Constructed from the logarithmic transformation of the 227 transformation matrix. 228 """ 229 for k in self.data: 230 if self.data[k] != 0: 231 self.data[k] = -math.log(self.data[k]) 232 return self 233 234 def smprint(self, name="your_name_here"): 235 """Print a stepmatrix.""" 236 matrix = "usertype %s stepmatrix=%d\n" % (name, len(self.symbols)) 237 matrix += " %s\n" % " ".join(self.symbols) 238 for x in self.symbols: 239 matrix += "[%s]".ljust(8) % x 240 for y in self.symbols: 241 if x == y: 242 matrix += " . " 243 else: 244 if x > y: 245 x1, y1 = y, x 246 else: 247 x1, y1 = x, y 248 if self.data[x1 + y1] == 0: 249 matrix += "inf. " 250 else: 251 matrix += "%2.2f".ljust(10) % (self.data[x1 + y1]) 252 matrix += "\n" 253 matrix += ";\n" 254 return matrix 255 256 257def safename(name, mrbayes=False): 258 """Return a taxon identifier according to NEXUS standard. 259 260 Wrap quotes around names with punctuation or whitespace, and double 261 single quotes. 262 263 mrbayes=True: write names without quotes, whitespace or punctuation 264 for the mrbayes software package. 265 """ 266 if mrbayes: 267 safe = name.replace(" ", "_") 268 safe = "".join(c for c in safe if c in MRBAYESSAFE) 269 else: 270 safe = name.replace("'", "''") 271 if set(safe).intersection(set(WHITESPACE + PUNCTUATION)): 272 safe = "'" + safe + "'" 273 return safe 274 275 276def quotestrip(word): 277 """Remove quotes and/or double quotes around identifiers.""" 278 if not word: 279 return None 280 while (word.startswith("'") and word.endswith("'")) or ( 281 word.startswith('"') and word.endswith('"') 282 ): 283 word = word[1:-1] 284 return word 285 286 287def get_start_end(sequence, skiplist=("-", "?")): 288 """Return position of first and last character which is not in skiplist. 289 290 Skiplist defaults to ['-','?']. 291 """ 292 length = len(sequence) 293 if length == 0: 294 return None, None 295 end = length - 1 296 while end >= 0 and (sequence[end] in skiplist): 297 end -= 1 298 start = 0 299 while start < length and (sequence[start] in skiplist): 300 start += 1 301 if start == length and end == -1: # empty sequence 302 return -1, -1 303 else: 304 return start, end 305 306 307def _sort_keys_by_values(p): 308 """Return a sorted list of keys of p sorted by values of p (PRIVATE).""" 309 return sorted((pn for pn in p if p[pn]), key=lambda pn: p[pn]) 310 311 312def _make_unique(l): 313 """Check all values in list are unique and return a pruned and sorted list (PRIVATE).""" 314 return sorted(set(l)) 315 316 317def _unique_label(previous_labels, label): 318 """Return a unique name if label is already in previous_labels (PRIVATE).""" 319 while label in previous_labels: 320 label_split = label.split(".") 321 if label_split[-1].startswith("copy"): 322 copy_num = 1 323 if label_split[-1] != "copy": 324 copy_num = int(label_split[-1][4:]) + 1 325 new_label = "%s.copy%s" % (".".join(label_split[:-1]), copy_num) 326 label = new_label 327 else: 328 label += ".copy" 329 return label 330 331 332def _seqmatrix2strmatrix(matrix): 333 """Convert a Seq-object matrix to a plain sequence-string matrix (PRIVATE).""" 334 return {t: str(matrix[t]) for t in matrix} 335 336 337def _compact4nexus(orig_list): 338 r"""Compact lists for Nexus output (PRIVATE). 339 340 Example 341 ------- 342 >>> _compact4nexus([1, 2, 3, 5, 6, 7, 8, 12, 15, 18, 20]) 343 '2-4 6-9 13-19\\3 21' 344 345 Transform [1 2 3 5 6 7 8 12 15 18 20] (baseindex 0, used in the Nexus class) 346 into '2-4 6-9 13-19\\3 21' (baseindex 1, used in programs like Paup or MrBayes.). 347 348 """ 349 if not orig_list: 350 return "" 351 orig_list = sorted(set(orig_list)) 352 shortlist = [] 353 clist = orig_list[:] 354 clist.append(clist[-1] + 0.5) # dummy value makes it easier 355 while len(clist) > 1: 356 step = 1 357 for i, x in enumerate(clist): 358 if x == clist[0] + i * step: # are we still in the right step? 359 continue 360 elif i == 1 and len(clist) > 3 and clist[i + 1] - x == x - clist[0]: 361 # second element, and possibly at least 3 elements to link, 362 # and the next one is in the right step 363 step = x - clist[0] 364 else: # pattern broke, add all values before current position to new list 365 sub = clist[:i] 366 if len(sub) == 1: 367 shortlist.append(str(sub[0] + 1)) 368 else: 369 if step == 1: 370 shortlist.append("%d-%d" % (sub[0] + 1, sub[-1] + 1)) 371 else: 372 shortlist.append("%d-%d\\%d" % (sub[0] + 1, sub[-1] + 1, step)) 373 clist = clist[i:] 374 break 375 return " ".join(shortlist) 376 377 378def combine(matrices): 379 """Combine matrices in [(name,nexus-instance),...] and return new nexus instance. 380 381 combined_matrix=combine([(name1,nexus_instance1),(name2,nexus_instance2),...] 382 Character sets, character partitions and taxon sets are prefixed, readjusted 383 and present in the combined matrix. 384 """ 385 if not matrices: 386 return None 387 name = matrices[0][0] 388 combined = copy.deepcopy(matrices[0][1]) # initiate with copy of first matrix 389 mixed_datatypes = len({n[1].datatype for n in matrices}) > 1 390 if mixed_datatypes: 391 # dealing with mixed matrices is application specific. 392 # You take care of that yourself! 393 combined.datatype = "None" 394 # raise NexusError('Matrices must be of same datatype') 395 combined.charlabels = None 396 combined.statelabels = None 397 combined.interleave = False 398 combined.translate = None 399 400 # rename taxon sets and character sets and name them with prefix 401 for cn, cs in combined.charsets.items(): 402 combined.charsets["%s.%s" % (name, cn)] = cs 403 del combined.charsets[cn] 404 for tn, ts in combined.taxsets.items(): 405 combined.taxsets["%s.%s" % (name, tn)] = ts 406 del combined.taxsets[tn] 407 # previous partitions usually don't make much sense in combined matrix 408 # just initiate one new partition parted by single matrices 409 combined.charpartitions = {"combined": {name: list(range(combined.nchar))}} 410 for n, m in matrices[1:]: # add all other matrices 411 both = [t for t in combined.taxlabels if t in m.taxlabels] 412 combined_only = [t for t in combined.taxlabels if t not in both] 413 m_only = [t for t in m.taxlabels if t not in both] 414 for t in both: 415 # concatenate sequences and unify gap and missing character symbols 416 combined.matrix[t] += Seq( 417 str(m.matrix[t]) 418 .replace(m.gap, combined.gap) 419 .replace(m.missing, combined.missing), 420 ) 421 # replace date of missing taxa with symbol for missing data 422 for t in combined_only: 423 combined.matrix[t] += Seq(combined.missing * m.nchar) 424 for t in m_only: 425 combined.matrix[t] = Seq(combined.missing * combined.nchar) + Seq( 426 str(m.matrix[t]) 427 .replace(m.gap, combined.gap) 428 .replace(m.missing, combined.missing), 429 ) 430 combined.taxlabels.extend(m_only) # new taxon list 431 for cn, cs in m.charsets.items(): # adjust character sets for new matrix 432 combined.charsets["%s.%s" % (n, cn)] = [x + combined.nchar for x in cs] 433 if m.taxsets: 434 if not combined.taxsets: 435 combined.taxsets = {} 436 # update taxon sets 437 combined.taxsets.update( 438 {"%s.%s" % (n, tn): ts for tn, ts in m.taxsets.items()} 439 ) 440 # update new charpartition 441 combined.charpartitions["combined"][n] = list( 442 range(combined.nchar, combined.nchar + m.nchar) 443 ) 444 # update charlabels 445 if m.charlabels: 446 if not combined.charlabels: 447 combined.charlabels = {} 448 combined.charlabels.update( 449 {combined.nchar + i: label for i, label in m.charlabels.items()} 450 ) 451 combined.nchar += m.nchar # update nchar and ntax 452 combined.ntax += len(m_only) 453 454 # some prefer partitions, some charsets: 455 # make separate charset for ecah initial dataset 456 for c in combined.charpartitions["combined"]: 457 combined.charsets[c] = combined.charpartitions["combined"][c] 458 459 return combined 460 461 462def _kill_comments_and_break_lines(text): 463 r"""Delete []-delimited comments out of a file and break into lines separated by ';' (PRIVATE). 464 465 stripped_text=_kill_comments_and_break_lines(text): 466 Nested and multiline comments are allowed. [ and ] symbols within single 467 or double quotes are ignored, newline ends a quote, all symbols with quotes are 468 treated the same (thus not quoting inside comments like [this character ']' ends a comment]) 469 Special [&...] and [\...] comments remain untouched, if not inside standard comment. 470 Quotes inside special [& and [\ are treated as normal characters, 471 but no nesting inside these special comments allowed (like [& [\ ]]). 472 ';' ist deleted from end of line. 473 474 NOTE: this function is very slow for large files, and obsolete when using C extension cnexus 475 """ 476 if not text: 477 return "" 478 contents = iter(text) 479 newtext = [] 480 newline = [] 481 quotelevel = "" 482 speciallevel = False 483 commlevel = 0 484 # Parse with one character look ahead (for special comments) 485 t2 = next(contents) 486 while True: 487 t = t2 488 try: 489 t2 = next(contents) 490 except StopIteration: 491 t2 = None 492 if t is None: 493 break 494 if t == quotelevel and not (commlevel or speciallevel): 495 # matching quote ends quotation 496 quotelevel = "" 497 elif ( 498 not quotelevel 499 and not (commlevel or speciallevel) 500 and (t == '"' or t == "'") 501 ): 502 # single or double quote starts quotation 503 quotelevel = t 504 elif not quotelevel and t == "[": 505 # opening bracket outside a quote 506 if t2 in SPECIALCOMMENTS and commlevel == 0 and not speciallevel: 507 speciallevel = True 508 else: 509 commlevel += 1 510 elif not quotelevel and t == "]": 511 # closing bracket ioutside a quote 512 if speciallevel: 513 speciallevel = False 514 else: 515 commlevel -= 1 516 if commlevel < 0: 517 raise NexusError("Nexus formatting error: unmatched ]") 518 continue 519 if commlevel == 0: 520 # copy if we're not in comment 521 if t == ";" and not quotelevel: 522 newtext.append("".join(newline)) 523 newline = [] 524 else: 525 newline.append(t) 526 # level of comments should be 0 at the end of the file 527 if newline: 528 newtext.append("\n".join(newline)) 529 if commlevel > 0: 530 raise NexusError("Nexus formatting error: unmatched [") 531 return newtext 532 533 534def _adjust_lines(lines): 535 """Adjust linebreaks to match ';', strip leading/trailing whitespace (PRIVATE). 536 537 list_of_commandlines=_adjust_lines(input_text) 538 Lines are adjusted so that no linebreaks occur within a commandline 539 (except matrix command line) 540 """ 541 formatted_lines = [] 542 for line in lines: 543 # Convert line endings 544 line = line.replace("\r\n", "\n").replace("\r", "\n").strip() 545 if line.lower().startswith("matrix"): 546 formatted_lines.append(line) 547 else: 548 line = line.replace("\n", " ") 549 if line: 550 formatted_lines.append(line) 551 return formatted_lines 552 553 554def _replace_parenthesized_ambigs(seq, rev_ambig_values): 555 """Replace ambigs in xxx(ACG)xxx format by IUPAC ambiguity code (PRIVATE).""" 556 opening = seq.find("(") 557 while opening > -1: 558 closing = seq.find(")") 559 if closing < 0: 560 raise NexusError("Missing closing parenthesis in: " + seq) 561 elif closing < opening: 562 raise NexusError("Missing opening parenthesis in: " + seq) 563 ambig = "".join(sorted(seq[opening + 1 : closing])) 564 ambig_code = rev_ambig_values[ambig.upper()] 565 if ambig != ambig.upper(): 566 ambig_code = ambig_code.lower() 567 seq = seq[:opening] + ambig_code + seq[closing + 1 :] 568 opening = seq.find("(") 569 return seq 570 571 572class Commandline: 573 """Represent a commandline as command and options.""" 574 575 def __init__(self, line, title): 576 """Initialize the class.""" 577 self.options = {} 578 options = [] 579 self.command = None 580 try: 581 # Assume matrix (all other command lines have been stripped of \n) 582 self.command, options = line.strip().split("\n", 1) 583 except ValueError: # Not matrix 584 # self.command,options=line.split(' ',1) # no: could be tab or spaces (translate...) 585 self.command = line.split()[0] 586 options = " ".join(line.split()[1:]) 587 self.command = self.command.strip().lower() 588 if self.command in SPECIAL_COMMANDS: 589 # special command that need newlines and order of options preserved 590 self.options = options.strip() 591 else: 592 if len(options) > 0: 593 try: 594 options = options.replace("=", " = ").split() 595 valued_indices = [ 596 (n - 1, n, n + 1) 597 for n in range(len(options)) 598 if options[n] == "=" and n != 0 and n != len(options) 599 ] 600 indices = [] 601 for sl in valued_indices: 602 indices.extend(sl) 603 token_indices = [n for n in range(len(options)) if n not in indices] 604 for opt in valued_indices: 605 # self.options[options[opt[0]].lower()] = options[opt[2]].lower() 606 self.options[options[opt[0]].lower()] = options[opt[2]] 607 for token in token_indices: 608 self.options[options[token].lower()] = None 609 except ValueError: 610 raise NexusError( 611 "Incorrect formatting in line: %s" % line 612 ) from None 613 614 615class Block: 616 """Represent a NEXUS block with block name and list of commandlines.""" 617 618 def __init__(self, title=None): 619 """Initialize the class.""" 620 self.title = title 621 self.commandlines = [] 622 623 624class Nexus: 625 """Create the Nexus class, main class for the management of Nexus files.""" 626 627 def __init__(self, input=None): 628 """Initialize the class.""" 629 self.ntax = 0 # number of taxa 630 self.nchar = 0 # number of characters 631 self.unaltered_taxlabels = ( 632 [] 633 ) # taxlabels as the appear in the input file (incl. duplicates, etc.) 634 self.taxlabels = [] # labels for taxa, ordered by their id 635 self.charlabels = None # ... and for characters 636 self.statelabels = None # ... and for states 637 self.datatype = "dna" # (standard), dna, rna, nucleotide, protein 638 self.respectcase = False # case sensitivity 639 self.missing = "?" # symbol for missing characters 640 self.gap = "-" # symbol for gap 641 self.symbols = None # set of symbols 642 self.equate = None # set of symbol synonyms 643 self.matchchar = None # matching char for matrix representation 644 self.labels = None # left, right, no 645 self.transpose = False # whether matrix is transposed 646 self.interleave = False # whether matrix is interleaved 647 self.tokens = False # unsupported 648 self.eliminate = None # unsupported 649 self.matrix = None # ... 650 self.unknown_blocks = [] # blocks we don't care about 651 self.taxsets = {} 652 self.charsets = {} 653 self.charpartitions = {} 654 self.taxpartitions = {} 655 self.trees = [] # list of Trees (instances of Tree class) 656 self.translate = None # Dict to translate taxon <-> taxon numbers 657 self.structured = [] # structured input representation 658 self.set = {} # dict of the set command to set various options 659 self.options = {} # dict of the options command in the data block 660 self.codonposset = ( 661 None # name of the charpartition that defines codon positions 662 ) 663 664 # some defaults 665 self.options["gapmode"] = "missing" 666 667 if input: 668 self.read(input) 669 else: 670 self.read(DEFAULTNEXUS) 671 672 def get_original_taxon_order(self): 673 """Included for backwards compatibility (DEPRECATED).""" 674 return self.taxlabels 675 676 def set_original_taxon_order(self, value): 677 """Included for backwards compatibility (DEPRECATED).""" 678 self.taxlabels = value 679 680 original_taxon_order = property(get_original_taxon_order, set_original_taxon_order) 681 682 def read(self, input): 683 """Read and parse NEXUS input (a filename, file-handle, or string).""" 684 # 1. Assume we have the name of a file in the execution dir or a 685 # file-like object. 686 # Note we need to add parsing of the path to dir/filename 687 try: 688 with File.as_handle(input) as fp: 689 file_contents = fp.read() 690 self.filename = getattr(fp, "name", "Unknown_nexus_file") 691 except (TypeError, OSError, AttributeError): 692 # 2. Assume we have a string from a fh.read() 693 if isinstance(input, str): 694 file_contents = input 695 self.filename = "input_string" 696 else: 697 print(input.strip()[:50]) 698 raise NexusError("Unrecognized input: %s ..." % input[:100]) from None 699 file_contents = file_contents.strip() 700 if file_contents.startswith("#NEXUS"): 701 file_contents = file_contents[6:] 702 commandlines = _get_command_lines(file_contents) 703 # get rid of stupid 'NEXUS token - in merged treefiles, this might appear multiple times' 704 for i, cl in enumerate(commandlines): 705 try: 706 if cl[:6].upper() == "#NEXUS": 707 commandlines[i] = cl[6:].strip() 708 except IndexError: 709 pass 710 # now loop through blocks (we parse only data in known blocks, thus ignoring non-block commands 711 nexus_block_gen = self._get_nexus_block(commandlines) 712 while True: 713 try: 714 title, contents = next(nexus_block_gen) 715 except StopIteration: 716 break 717 if title in KNOWN_NEXUS_BLOCKS: 718 self._parse_nexus_block(title, contents) 719 else: 720 self._unknown_nexus_block(title, contents) 721 722 def _get_nexus_block(self, file_contents): 723 """Return a generator for looping through Nexus blocks (PRIVATE).""" 724 inblock = False 725 blocklines = [] 726 while file_contents: 727 cl = file_contents.pop(0) 728 if cl.lower().startswith("begin"): 729 if not inblock: 730 inblock = True 731 title = cl.split()[1].lower() 732 else: 733 raise NexusError("Illegal block nesting in block %s" % title) 734 elif cl.lower().startswith("end"): 735 if inblock: 736 inblock = False 737 yield title, blocklines 738 blocklines = [] 739 else: 740 raise NexusError("Unmatched 'end'.") 741 elif inblock: 742 blocklines.append(cl) 743 744 def _unknown_nexus_block(self, title, contents): 745 block = Block() 746 block.commandlines.append(contents) 747 block.title = title 748 self.unknown_blocks.append(block) 749 750 def _parse_nexus_block(self, title, contents): 751 """Parse a known Nexus Block (PRIVATE).""" 752 # attached the structured block representation 753 self._apply_block_structure(title, contents) 754 # now check for taxa,characters,data blocks. If this stuff is defined more than once 755 # the later occurrences will override the previous ones. 756 block = self.structured[-1] 757 for line in block.commandlines: 758 try: 759 getattr(self, "_" + line.command)(line.options) 760 except AttributeError: 761 raise NexusError("Unknown command: %s " % line.command) from None 762 763 def _title(self, options): 764 pass 765 766 def _link(self, options): 767 pass 768 769 def _dimensions(self, options): 770 if "ntax" in options: 771 self.ntax = eval(options["ntax"]) 772 if "nchar" in options: 773 self.nchar = eval(options["nchar"]) 774 775 def _format(self, options): 776 # print options 777 # we first need to test respectcase, then symbols (which depends on respectcase) 778 # then datatype (which, if standard, depends on symbols and respectcase in order to generate 779 # dicts for ambiguous values 780 if "respectcase" in options: 781 self.respectcase = True 782 # adjust symbols to for respectcase 783 if "symbols" in options: 784 self.symbols = "".join(options["symbols"].split()) 785 if (self.symbols.startswith('"') and self.symbols.endswith('"')) or ( 786 self.symbols.startswith("'") and self.symbols.endswith("'") 787 ): 788 self.symbols = self.symbols[1:-1] 789 if not self.respectcase: 790 self.symbols = list(self.symbols.upper()) 791 # self.symbols = self.symbols.lower() + self.symbols.upper() 792 # self.symbols = list(set(self.symbols)) 793 if "datatype" in options: 794 self.datatype = options["datatype"].lower() 795 796 if self.datatype == "dna" or self.datatype == "nucleotide": 797 self.ambiguous_values = IUPACData.ambiguous_dna_values.copy() 798 self.unambiguous_letters = IUPACData.unambiguous_dna_letters 799 elif self.datatype == "rna": 800 self.ambiguous_values = IUPACData.ambiguous_rna_values.copy() 801 self.unambiguous_letters = IUPACData.unambiguous_rna_letters 802 elif self.datatype == "protein": 803 self.ambiguous_values = { 804 "B": "DN", 805 "Z": "EQ", 806 "X": IUPACData.protein_letters, 807 } 808 # that's how PAUP handles it 809 self.unambiguous_letters = IUPACData.protein_letters + "*" # stop-codon 810 elif self.datatype == "standard": 811 self.ambiguous_values = {} 812 if not self.symbols: 813 # PARSER BUG ## 814 # This error arises when symbols are absent or when 815 # whitespace is located within the SYMBOLS command values. 816 # The Nexus parser quits reading the SYMBOLS line upon 817 # finding a whitespace character. 818 raise NexusError( 819 "Symbols must be defined when using standard datatype. " 820 "Please remove any whitespace (spaces, tabs, etc.) " 821 "between values for symbols as this confuses the Nexus parser." 822 ) 823 824 self.unambiguous_letters = "".join(self.symbols) 825 if not self.respectcase: 826 self.unambiguous_letters += self.unambiguous_letters.lower() 827 else: 828 raise NexusError("Unsupported datatype: " + self.datatype) 829 self.valid_characters = ( 830 "".join(self.ambiguous_values) + self.unambiguous_letters 831 ) 832 if not self.respectcase: 833 self.valid_characters = ( 834 self.valid_characters.lower() + self.valid_characters.upper() 835 ) 836 # we have to sort the reverse ambig coding dict key characters: 837 # to be sure that it's 'ACGT':'N' and not 'GTCA':'N' 838 rev = {v: k for k, v in self.ambiguous_values.items() if k != "X"} 839 self.rev_ambiguous_values = {} 840 for k, v in rev.items(): 841 key = sorted(c for c in k) 842 self.rev_ambiguous_values["".join(key)] = v 843 # overwrite symbols for datatype rna,dna,nucleotide 844 if self.datatype in ["dna", "nucleotide"]: 845 self.symbols = IUPACData.ambiguous_dna_letters 846 if self.missing not in self.ambiguous_values: 847 self.ambiguous_values[self.missing] = ( 848 self.unambiguous_letters + self.gap 849 ) 850 self.ambiguous_values[self.gap] = self.gap 851 elif self.datatype == "rna": 852 self.symbols = IUPACData.ambiguous_rna_letters 853 if self.missing not in self.ambiguous_values: 854 self.ambiguous_values[self.missing] = ( 855 self.unambiguous_letters + self.gap 856 ) 857 self.ambiguous_values[self.gap] = self.gap 858 # elif self.datatype == 'standard': 859 # if not self.symbols: 860 # self.symbols = ['0', '1'] 861 if "missing" in options: 862 self.missing = options["missing"][0] 863 if "gap" in options: 864 self.gap = options["gap"][0] 865 if "equate" in options: 866 self.equate = options["equate"] 867 if "matchchar" in options: 868 self.matchchar = options["matchchar"][0] 869 if "labels" in options: 870 self.labels = options["labels"] 871 if "transpose" in options: 872 # self.transpose = True 873 raise NexusError("TRANSPOSE is not supported!") 874 if "interleave" in options: 875 if options["interleave"] is None or options["interleave"].lower() == "yes": 876 self.interleave = True 877 if "tokens" in options: 878 self.tokens = True 879 if "notokens" in options: 880 self.tokens = False 881 882 def _set(self, options): 883 self.set = options 884 885 def _options(self, options): 886 self.options = options 887 888 def _eliminate(self, options): 889 self.eliminate = options 890 891 def _taxlabels(self, options): 892 """Get taxon labels (PRIVATE). 893 894 As the taxon names are already in the matrix, this is superfluous 895 except for transpose matrices, which are currently unsupported anyway. 896 Thus, we ignore the taxlabels command to make handling of duplicate 897 taxon names easier. 898 """ 899 pass 900 # self.taxlabels = [] 901 # opts = CharBuffer(options) 902 # while True: 903 # taxon = quotestrip(opts.next_word()) 904 # if not taxon: 905 # break 906 # self.taxlabels.append(taxon) 907 908 def _check_taxlabels(self, taxon): 909 """Check for presence of taxon in self.taxlabels (PRIVATE).""" 910 # According to NEXUS standard, underscores shall be treated as spaces..., 911 # so checking for identity is more difficult 912 nextaxa = {t.replace(" ", "_"): t for t in self.taxlabels} 913 nexid = taxon.replace(" ", "_") 914 return nextaxa.get(nexid) 915 916 def _charlabels(self, options): 917 """Get labels for characters (PRIVATE).""" 918 self.charlabels = {} 919 opts = CharBuffer(options) 920 while True: 921 # get id and state 922 w = opts.next_word() 923 if ( 924 w is None 925 ): # McClade saves and reads charlabel-lists with terminal comma?! 926 break 927 identifier = self._resolve(w, set_type=CHARSET) 928 state = quotestrip(opts.next_word()) 929 self.charlabels[identifier] = state 930 # check for comma or end of command 931 c = opts.next_nonwhitespace() 932 if c is None: 933 break 934 elif c != ",": 935 raise NexusError("Missing ',' in line %s." % options) 936 937 def _charstatelabels(self, options): 938 self.charlabels = {} 939 self.statelabels = {} 940 opts = CharBuffer(options) 941 942 # Make sure symbols are defined 943 if not self.symbols: 944 raise NexusError("Symbols must be defined when using character states") 945 946 while True: 947 # get id and character name 948 w = opts.next_word() 949 950 # McClade saves and reads charlabel-lists with terminal comma?! 951 if w is None: 952 break 953 954 identifier = self._resolve(w, set_type=CHARSET) 955 character = quotestrip(opts.next_word()) 956 957 self.charlabels[identifier] = character 958 self.statelabels[identifier] = [] 959 960 # check for comma, slash or end of command 961 c = opts.next_nonwhitespace() 962 963 if c is None: 964 break 965 elif c != ",": 966 # Check if states are defined, otherwise report error 967 if c != "/": 968 raise NexusError("Missing ',' in line %s." % options) 969 970 # Get the first state 971 state = quotestrip(opts.next_word()) 972 973 if state is None: 974 raise NexusError("Missing character state in line %s." % options) 975 976 while True: 977 # Make sure current state does not exceed number of 978 # available symbols 979 if len(self.statelabels[identifier]) > len(self.symbols): 980 raise NexusError( 981 "Character states exceed number of available symbols in line %s." 982 % options 983 ) 984 985 # Add the character state to the statelabels 986 self.statelabels[identifier].append(state) 987 988 # Check for another state or comma to end states (last 989 # character should not have comma at end of states - but 990 # we'll ignore) 991 state = quotestrip(opts.next_word()) 992 993 if state is None: 994 return 995 elif state == ",": 996 break 997 998 def _statelabels(self, options): 999 # self.charlabels = options 1000 # print("Command statelabels is not supported and will be ignored.") 1001 pass 1002 1003 def _matrix(self, options): 1004 """Create a matrix for NEXUS object (PRIVATE).""" 1005 if not self.ntax or not self.nchar: 1006 raise NexusError("Dimensions must be specified before matrix!") 1007 self.matrix = {} 1008 taxcount = 0 1009 first_matrix_block = True 1010 1011 # eliminate empty lines and leading/trailing whitespace 1012 lines = [_.strip() for _ in options.split("\n") if _.strip() != ""] 1013 lineiter = iter(lines) 1014 while True: 1015 try: 1016 line = next(lineiter) 1017 except StopIteration: 1018 if taxcount < self.ntax: 1019 raise NexusError("Not enough taxa in matrix.") from None 1020 elif taxcount > self.ntax: 1021 raise NexusError("Too many taxa in matrix.") from None 1022 else: 1023 break 1024 # count the taxa and check for interleaved matrix 1025 taxcount += 1 1026 if taxcount > self.ntax: 1027 if not self.interleave: 1028 raise NexusError( 1029 "Too many taxa in matrix - should matrix be interleaved?" 1030 ) 1031 else: 1032 taxcount = 1 1033 first_matrix_block = False 1034 # get taxon name and sequence 1035 linechars = CharBuffer(line) 1036 id = quotestrip(linechars.next_word()) 1037 line = linechars.rest().strip() 1038 chars = "" 1039 if self.interleave: 1040 # interleaved matrix 1041 if line: 1042 chars = "".join(line.split()) 1043 else: 1044 chars = "".join(next(lineiter).split()) 1045 else: 1046 # non-interleaved matrix 1047 chars = "".join(line.split()) 1048 while len(chars) < self.nchar: 1049 line = next(lineiter) 1050 chars += "".join(line.split()) 1051 1052 # Reformat sequence for non-standard datatypes 1053 if self.datatype != "standard": 1054 iupac_seq = Seq( 1055 _replace_parenthesized_ambigs(chars, self.rev_ambiguous_values), 1056 ) 1057 # first taxon has the reference sequence if matchhar is used 1058 if taxcount == 1: 1059 refseq = iupac_seq 1060 else: 1061 if self.matchchar: 1062 while True: 1063 p = iupac_seq.find(self.matchchar) 1064 if p == -1: 1065 break 1066 iupac_seq = Seq( 1067 iupac_seq[:p] + refseq[p] + iupac_seq[p + 1 :] 1068 ) 1069 1070 # Check for invalid characters 1071 for i, c in enumerate(iupac_seq): 1072 if ( 1073 c not in self.valid_characters 1074 and c != self.gap 1075 and c != self.missing 1076 ): 1077 raise NexusError( 1078 "Taxon %s: Illegal character %s in sequence %s " 1079 "(check dimensions/interleaving)" % (id, c, iupac_seq) 1080 ) 1081 else: 1082 iupac_seq = StandardData(chars) 1083 1084 # Check for invalid characters 1085 for i, c in enumerate(iupac_seq): 1086 # Go through each coding for each character 1087 for coding in c["d"]: 1088 if coding not in self.valid_characters: 1089 if coding != self.gap and coding != self.missing: 1090 raise NexusError( 1091 "Taxon %s: Illegal character %s in sequence %s " 1092 "(check dimensions/interleaving)" 1093 % (id, coding, iupac_seq) 1094 ) 1095 1096 # add sequence to matrix 1097 if first_matrix_block: 1098 self.unaltered_taxlabels.append(id) 1099 id = _unique_label(list(self.matrix.keys()), id) 1100 self.matrix[id] = iupac_seq 1101 self.taxlabels.append(id) 1102 else: 1103 # taxon names need to be in the same order in each interleaved block 1104 id = _unique_label(self.taxlabels[: taxcount - 1], id) 1105 taxon_present = self._check_taxlabels(id) 1106 if taxon_present: 1107 self.matrix[taxon_present] += iupac_seq 1108 else: 1109 raise NexusError( 1110 "Taxon %s not in first block of interleaved " 1111 "matrix. Check matrix dimensions and interleave." % id 1112 ) 1113 # check all sequences for length according to nchar 1114 for taxon in self.matrix: 1115 if len(self.matrix[taxon]) != self.nchar: 1116 raise NexusError( 1117 "Matrix Nchar %d does not match data length (%d) for taxon %s" 1118 % (self.nchar, len(self.matrix[taxon]), taxon) 1119 ) 1120 # check that taxlabels is identical with matrix.keys. If not, it's a problem 1121 matrixkeys = sorted(self.matrix) 1122 taxlabelssort = sorted(self.taxlabels[:]) 1123 if matrixkeys != taxlabelssort: 1124 raise ValueError( 1125 "ERROR: TAXLABELS must be identical with MATRIX. " 1126 "Please Report this as a bug, and send in data file." 1127 ) 1128 1129 def _translate(self, options): 1130 """Translate a Nexus file (PRIVATE).""" 1131 self.translate = {} 1132 opts = CharBuffer(options) 1133 while True: 1134 try: 1135 # get id and state 1136 identifier = int(opts.next_word()) 1137 label = quotestrip(opts.next_word()) 1138 self.translate[identifier] = label 1139 # check for comma or end of command 1140 c = opts.next_nonwhitespace() 1141 if c is None: 1142 break 1143 elif c != ",": 1144 raise NexusError("Missing ',' in line %s." % options) 1145 except NexusError: 1146 raise 1147 except Exception: # TODO: ValueError? 1148 raise NexusError("Format error in line %s." % options) from None 1149 1150 def _utree(self, options): 1151 """Use 'utree' to denote an unrooted tree (ex: clustalx) (PRIVATE).""" 1152 self._tree(options) 1153 1154 def _tree(self, options): 1155 opts = CharBuffer(options) 1156 if opts.peek_nonwhitespace() == "*": 1157 # a star can be used to make it the default tree in some software packages 1158 dummy = opts.next_nonwhitespace() 1159 name = opts.next_word() 1160 if opts.next_nonwhitespace() != "=": 1161 raise NexusError("Syntax error in tree description: %s" % options[:50]) 1162 rooted = False 1163 weight = 1.0 1164 while opts.peek_nonwhitespace() == "[": 1165 opts.next_nonwhitespace() # discard opening bracket 1166 symbol = next(opts) 1167 if symbol != "&": 1168 raise NexusError( 1169 "Illegal special comment [%s...] in tree description: %s" 1170 % (symbol, options[:50]) 1171 ) 1172 special = next(opts) 1173 value = opts.next_until("]") 1174 next(opts) # discard closing bracket 1175 if special == "R": 1176 rooted = True 1177 elif special == "U": 1178 rooted = False 1179 elif special == "W": 1180 weight = float(value) 1181 tree = Tree(name=name, weight=weight, rooted=rooted, tree=opts.rest().strip()) 1182 # if there's an active translation table, translate 1183 if self.translate: 1184 for n in tree.get_terminals(): 1185 try: 1186 tree.node(n).data.taxon = safename( 1187 self.translate[int(tree.node(n).data.taxon)] 1188 ) 1189 except (ValueError, KeyError): 1190 raise NexusError( 1191 "Unable to substitute %s using 'translate' data." 1192 % tree.node(n).data.taxon 1193 ) from None 1194 self.trees.append(tree) 1195 1196 def _apply_block_structure(self, title, lines): 1197 """Apply Block structure to the NEXUS file (PRIVATE).""" 1198 block = Block("") 1199 block.title = title 1200 for line in lines: 1201 block.commandlines.append(Commandline(line, title)) 1202 self.structured.append(block) 1203 1204 def _taxset(self, options): 1205 """Create unique taxset (PRIVATE).""" 1206 name, taxa = self._get_indices(options, set_type=TAXSET) 1207 self.taxsets[name] = _make_unique(taxa) 1208 1209 def _charset(self, options): 1210 """Create unique character set (PRIVATE).""" 1211 name, sites = self._get_indices(options, set_type=CHARSET) 1212 self.charsets[name] = _make_unique(sites) 1213 1214 def _taxpartition(self, options): 1215 """Collect taxpartition from a NEXUS file (PRIVATE).""" 1216 taxpartition = {} 1217 quotelevel = False 1218 opts = CharBuffer(options) 1219 name = self._name_n_vector(opts) 1220 if not name: 1221 raise NexusError("Formatting error in taxpartition: %s " % options) 1222 # now collect thesubbpartitions and parse them 1223 # subpartitons separated by commas - which unfortunately could be part of a quoted identifier... 1224 # this is rather unelegant, but we have to avoid double-parsing and potential change of special nexus-words 1225 sub = "" 1226 while True: 1227 w = next(opts) 1228 if w is None or (w == "," and not quotelevel): 1229 subname, subindices = self._get_indices( 1230 sub, set_type=TAXSET, separator=":" 1231 ) 1232 taxpartition[subname] = _make_unique(subindices) 1233 sub = "" 1234 if w is None: 1235 break 1236 else: 1237 if w == "'": 1238 quotelevel = not quotelevel 1239 sub += w 1240 self.taxpartitions[name] = taxpartition 1241 1242 def _codonposset(self, options): 1243 """Read codon positions from a codons block as written from McClade (PRIVATE). 1244 1245 Here codonposset is just a fancy name for a character partition with 1246 the name CodonPositions and the partitions N,1,2,3 1247 """ 1248 prev_partitions = list(self.charpartitions.keys()) 1249 self._charpartition(options) 1250 # mcclade calls it CodonPositions, but you never know... 1251 codonname = [n for n in self.charpartitions if n not in prev_partitions] 1252 if codonname == [] or len(codonname) > 1: 1253 raise NexusError("Formatting Error in codonposset: %s " % options) 1254 else: 1255 self.codonposset = codonname[0] 1256 1257 def _codeset(self, options): 1258 pass 1259 1260 def _charpartition(self, options): 1261 """Collect character partition from NEXUS file (PRIVATE).""" 1262 charpartition = {} 1263 quotelevel = False 1264 opts = CharBuffer(options) 1265 name = self._name_n_vector(opts) 1266 if not name: 1267 raise NexusError("Formatting error in charpartition: %s " % options) 1268 # now collect the subpartitions and parse them 1269 # subpartitions separated by commas - which unfortunately could be part 1270 # of a quoted identifier... 1271 sub = "" 1272 while True: 1273 w = next(opts) 1274 if w is None or (w == "," and not quotelevel): 1275 subname, subindices = self._get_indices( 1276 sub, set_type=CHARSET, separator=":" 1277 ) 1278 charpartition[subname] = _make_unique(subindices) 1279 sub = "" 1280 if w is None: 1281 break 1282 else: 1283 if w == "'": 1284 quotelevel = not quotelevel 1285 sub += w 1286 self.charpartitions[name] = charpartition 1287 1288 def _get_indices(self, options, set_type=CHARSET, separator="="): 1289 r"""Parse the taxset/charset specification (PRIVATE). 1290 1291 e.g. '1 2 3 - 5 dog cat 10 - 20 \\ 3' 1292 --> [0,1,2,3,4,'dog','cat',9,12,15,18] 1293 """ 1294 opts = CharBuffer(options) 1295 name = self._name_n_vector(opts, separator=separator) 1296 indices = self._parse_list(opts, set_type=set_type) 1297 if indices is None: 1298 raise NexusError("Formatting error in line: %s " % options) 1299 return name, indices 1300 1301 def _name_n_vector(self, opts, separator="="): 1302 """Extract name and check that it's not in vector format (PRIVATE).""" 1303 rest = opts.rest() 1304 name = opts.next_word() 1305 # we ignore * before names 1306 if name == "*": 1307 name = opts.next_word() 1308 if not name: 1309 raise NexusError("Formatting error in line: %s " % rest) 1310 name = quotestrip(name) 1311 if opts.peek_nonwhitespace == "(": 1312 open = opts.next_nonwhitespace() 1313 qualifier = open.next_word() 1314 close = opts.next_nonwhitespace() 1315 if qualifier.lower() == "vector": 1316 raise NexusError("Unsupported VECTOR format in line %s" % (opts)) 1317 elif qualifier.lower() != "standard": 1318 raise NexusError("Unknown qualifier %s in line %s" % (qualifier, opts)) 1319 if opts.next_nonwhitespace() != separator: 1320 raise NexusError("Formatting error in line: %s " % rest) 1321 return name 1322 1323 def _parse_list(self, options_buffer, set_type): 1324 r"""Parse a NEXUS list (PRIVATE). 1325 1326 e.g. [1, 2, 4-8\\2, dog, cat] --> [1,2,4,6,8,17,21], 1327 (assuming dog is taxon no. 17 and cat is taxon no. 21). 1328 """ 1329 plain_list = [] 1330 if options_buffer.peek_nonwhitespace(): 1331 try: 1332 # capture all possible exceptions and treat them as formatting 1333 # errors, if they are not NexusError 1334 while True: 1335 identifier = options_buffer.next_word() # next list element 1336 if not identifier: # end of list? 1337 break 1338 start = self._resolve(identifier, set_type=set_type) 1339 if options_buffer.peek_nonwhitespace() == "-": # followd by - 1340 end = start 1341 step = 1 1342 # get hyphen and end of range 1343 hyphen = options_buffer.next_nonwhitespace() 1344 end = self._resolve( 1345 options_buffer.next_word(), set_type=set_type 1346 ) 1347 if set_type == CHARSET: 1348 if ( 1349 options_buffer.peek_nonwhitespace() == "\\" 1350 ): # followd by \ 1351 backslash = options_buffer.next_nonwhitespace() 1352 step = int( 1353 options_buffer.next_word() 1354 ) # get backslash and step 1355 plain_list.extend(range(start, end + 1, step)) 1356 else: 1357 if isinstance(start, list) or isinstance(end, list): 1358 raise NexusError( 1359 "Name if character sets not allowed in range definition: %s" 1360 % identifier 1361 ) 1362 start = self.taxlabels.index(start) 1363 end = self.taxlabels.index(end) 1364 taxrange = self.taxlabels[start : end + 1] 1365 plain_list.extend(taxrange) 1366 else: 1367 if isinstance(start, list): 1368 # start was the name of charset or taxset 1369 plain_list.extend(start) 1370 else: 1371 # start was an ordinary identifier 1372 plain_list.append(start) 1373 except NexusError: 1374 raise 1375 except Exception: # FIXME - this seems unwise 1376 return None 1377 return plain_list 1378 1379 def _resolve(self, identifier, set_type=None): 1380 """Translate identifier in list into character/taxon index (PRIVATE). 1381 1382 Characters (which are referred to by their index in Nexus.py): 1383 Plain numbers are returned minus 1 (Nexus indices to python indices) 1384 Text identifiers are translated into their indices (if plain character identifiers), 1385 the first hit in charlabels is returned (charlabels don't need to be unique) 1386 or the range of indices is returned (if names of character sets). 1387 Taxa (which are referred to by their unique name in Nexus.py): 1388 Plain numbers are translated in their taxon name, underscores and spaces are considered equal. 1389 Names are returned unchanged (if plain taxon identifiers), or the names in 1390 the corresponding taxon set is returned. 1391 1392 """ 1393 identifier = quotestrip(identifier) 1394 if not set_type: 1395 raise NexusError("INTERNAL ERROR: Need type to resolve identifier.") 1396 if set_type == CHARSET: 1397 try: 1398 n = int(identifier) 1399 except ValueError: 1400 if self.charlabels and identifier in self.charlabels.values(): 1401 for k in self.charlabels: 1402 if self.charlabels[k] == identifier: 1403 return k 1404 elif self.charsets and identifier in self.charsets: 1405 return self.charsets[identifier] 1406 else: 1407 raise NexusError( 1408 "Unknown character identifier: %s" % identifier 1409 ) from None 1410 else: 1411 if n <= self.nchar: 1412 return n - 1 1413 else: 1414 raise NexusError( 1415 "Illegal character identifier: %d>nchar (=%d)." 1416 % (identifier, self.nchar) 1417 ) 1418 elif set_type == TAXSET: 1419 try: 1420 n = int(identifier) 1421 except ValueError: 1422 taxlabels_id = self._check_taxlabels(identifier) 1423 if taxlabels_id: 1424 return taxlabels_id 1425 elif self.taxsets and identifier in self.taxsets: 1426 return self.taxsets[identifier] 1427 else: 1428 raise NexusError( 1429 "Unknown taxon identifier: %s" % identifier 1430 ) from None 1431 else: 1432 if n > 0 and n <= self.ntax: 1433 return self.taxlabels[n - 1] 1434 else: 1435 raise NexusError( 1436 "Illegal taxon identifier: %d>ntax (=%d)." 1437 % (identifier, self.ntax) 1438 ) 1439 else: 1440 raise NexusError("Unknown set specification: %s." % set_type) 1441 1442 def _stateset(self, options): 1443 # Not implemented 1444 pass 1445 1446 def _changeset(self, options): 1447 # Not implemented 1448 pass 1449 1450 def _treeset(self, options): 1451 # Not implemented 1452 pass 1453 1454 def _treepartition(self, options): 1455 # Not implemented 1456 pass 1457 1458 def write_nexus_data_partitions( 1459 self, 1460 matrix=None, 1461 filename=None, 1462 blocksize=None, 1463 interleave=False, 1464 exclude=(), 1465 delete=(), 1466 charpartition=None, 1467 comment="", 1468 mrbayes=False, 1469 ): 1470 """Write a nexus file for each partition in charpartition. 1471 1472 Only non-excluded characters and non-deleted taxa are included, 1473 just the data block is written. 1474 """ 1475 if not matrix: 1476 matrix = self.matrix 1477 if not matrix: 1478 return 1479 if not filename: 1480 filename = self.filename 1481 if charpartition: 1482 pfilenames = {} 1483 for p in charpartition: 1484 total_exclude = list(exclude) 1485 total_exclude.extend( 1486 c for c in range(self.nchar) if c not in charpartition[p] 1487 ) 1488 total_exclude = _make_unique(total_exclude) 1489 pcomment = comment + "\nPartition: " + p + "\n" 1490 dot = filename.rfind(".") 1491 if dot > 0: 1492 pfilename = filename[:dot] + "_" + p + ".data" 1493 else: 1494 pfilename = filename + "_" + p 1495 pfilenames[p] = pfilename 1496 self.write_nexus_data( 1497 filename=pfilename, 1498 matrix=matrix, 1499 blocksize=blocksize, 1500 interleave=interleave, 1501 exclude=total_exclude, 1502 delete=delete, 1503 comment=pcomment, 1504 append_sets=False, 1505 mrbayes=mrbayes, 1506 ) 1507 return pfilenames 1508 else: 1509 fn = self.filename + ".data" 1510 self.write_nexus_data( 1511 filename=fn, 1512 matrix=matrix, 1513 blocksize=blocksize, 1514 interleave=interleave, 1515 exclude=exclude, 1516 delete=delete, 1517 comment=comment, 1518 append_sets=False, 1519 mrbayes=mrbayes, 1520 ) 1521 return fn 1522 1523 def write_nexus_data( 1524 self, 1525 filename=None, 1526 matrix=None, 1527 exclude=(), 1528 delete=(), 1529 blocksize=None, 1530 interleave=False, 1531 interleave_by_partition=False, 1532 comment=None, 1533 omit_NEXUS=False, 1534 append_sets=True, 1535 mrbayes=False, 1536 codons_block=True, 1537 ): 1538 """Write a nexus file with data and sets block to a file or handle. 1539 1540 Character sets and partitions are appended by default, and are 1541 adjusted according to excluded characters (i.e. character sets 1542 still point to the same sites (not necessarily same positions), 1543 without including the deleted characters. 1544 1545 - filename - Either a filename as a string (which will be opened, 1546 written to and closed), or a handle object (which will 1547 be written to but NOT closed). 1548 - interleave_by_partition - Optional name of partition (string) 1549 - omit_NEXUS - Boolean. If true, the '#NEXUS' line normally at the 1550 start of the file is omitted. 1551 1552 Returns the filename/handle used to write the data. 1553 """ 1554 if not matrix: 1555 matrix = self.matrix 1556 if not matrix: 1557 return 1558 if not filename: 1559 filename = self.filename 1560 if [t for t in delete if not self._check_taxlabels(t)]: 1561 raise NexusError( 1562 "Unknown taxa: %s" 1563 % ", ".join(set(delete).difference(set(self.taxlabels))) 1564 ) 1565 if interleave_by_partition: 1566 if interleave_by_partition not in self.charpartitions: 1567 raise NexusError("Unknown partition: %r" % interleave_by_partition) 1568 else: 1569 partition = self.charpartitions[interleave_by_partition] 1570 # we need to sort the partition names by starting position 1571 # before we exclude characters 1572 names = _sort_keys_by_values(partition) 1573 newpartition = {} 1574 for p in partition: 1575 newpartition[p] = [c for c in partition[p] if c not in exclude] 1576 # how many taxa and how many characters are left? 1577 undelete = [ 1578 taxon for taxon in self.taxlabels if taxon in matrix and taxon not in delete 1579 ] 1580 cropped_matrix = _seqmatrix2strmatrix( 1581 self.crop_matrix(matrix, exclude=exclude, delete=delete) 1582 ) 1583 ntax_adjusted = len(undelete) 1584 nchar_adjusted = len(cropped_matrix[undelete[0]]) 1585 if not undelete or (undelete and undelete[0] == ""): 1586 return 1587 1588 with File.as_handle(filename, mode="w") as fh: 1589 if not omit_NEXUS: 1590 fh.write("#NEXUS\n") 1591 if comment: 1592 fh.write("[" + comment + "]\n") 1593 fh.write("begin data;\n") 1594 fh.write("dimensions ntax=%d nchar=%d;\n" % (ntax_adjusted, nchar_adjusted)) 1595 fh.write("format datatype=" + self.datatype) 1596 if self.respectcase: 1597 fh.write(" respectcase") 1598 if self.missing: 1599 fh.write(" missing=" + self.missing) 1600 if self.gap: 1601 fh.write(" gap=" + self.gap) 1602 if self.matchchar: 1603 fh.write(" matchchar=" + self.matchchar) 1604 if self.labels: 1605 fh.write(" labels=" + self.labels) 1606 if self.equate: 1607 fh.write(" equate=" + self.equate) 1608 if interleave or interleave_by_partition: 1609 fh.write(" interleave") 1610 fh.write(";\n") 1611 # if self.taxlabels: 1612 # fh.write('taxlabels '+' '.join(self.taxlabels)+';\n') 1613 if self.charlabels: 1614 newcharlabels = self._adjust_charlabels(exclude=exclude) 1615 clkeys = sorted(newcharlabels) 1616 fh.write( 1617 "charlabels " 1618 + ", ".join( 1619 "%s %s" % (k + 1, safename(newcharlabels[k])) for k in clkeys 1620 ) 1621 + ";\n" 1622 ) 1623 fh.write("matrix\n") 1624 if not blocksize: 1625 if interleave: 1626 blocksize = 70 1627 else: 1628 blocksize = self.nchar 1629 # delete deleted taxa and ecxclude excluded characters... 1630 namelength = max(len(safename(t, mrbayes=mrbayes)) for t in undelete) 1631 if interleave_by_partition: 1632 # interleave by partitions, but adjust partitions with regard 1633 # to excluded characters 1634 seek = 0 1635 for p in names: 1636 fh.write("[%s: %s]\n" % (interleave_by_partition, p)) 1637 if len(newpartition[p]) > 0: 1638 for taxon in undelete: 1639 fh.write( 1640 safename(taxon, mrbayes=mrbayes).ljust(namelength + 1) 1641 ) 1642 fh.write( 1643 cropped_matrix[taxon][ 1644 seek : seek + len(newpartition[p]) 1645 ] 1646 + "\n" 1647 ) 1648 fh.write("\n") 1649 else: 1650 fh.write("[empty]\n\n") 1651 seek += len(newpartition[p]) 1652 elif interleave: 1653 for seek in range(0, nchar_adjusted, blocksize): 1654 for taxon in undelete: 1655 fh.write(safename(taxon, mrbayes=mrbayes).ljust(namelength + 1)) 1656 fh.write(cropped_matrix[taxon][seek : seek + blocksize] + "\n") 1657 fh.write("\n") 1658 else: 1659 for taxon in undelete: 1660 if blocksize < nchar_adjusted: 1661 fh.write(safename(taxon, mrbayes=mrbayes) + "\n") 1662 else: 1663 fh.write(safename(taxon, mrbayes=mrbayes).ljust(namelength + 1)) 1664 taxon_seq = cropped_matrix[taxon] 1665 for seek in range(0, nchar_adjusted, blocksize): 1666 fh.write(taxon_seq[seek : seek + blocksize] + "\n") 1667 del taxon_seq 1668 fh.write(";\nend;\n") 1669 if append_sets: 1670 if codons_block: 1671 fh.write( 1672 self.append_sets( 1673 exclude=exclude, 1674 delete=delete, 1675 mrbayes=mrbayes, 1676 include_codons=False, 1677 ) 1678 ) 1679 fh.write( 1680 self.append_sets( 1681 exclude=exclude, 1682 delete=delete, 1683 mrbayes=mrbayes, 1684 codons_only=True, 1685 ) 1686 ) 1687 else: 1688 fh.write( 1689 self.append_sets( 1690 exclude=exclude, delete=delete, mrbayes=mrbayes 1691 ) 1692 ) 1693 return filename 1694 1695 def append_sets( 1696 self, 1697 exclude=(), 1698 delete=(), 1699 mrbayes=False, 1700 include_codons=True, 1701 codons_only=False, 1702 ): 1703 """Return a sets block.""" 1704 if not self.charsets and not self.taxsets and not self.charpartitions: 1705 return "" 1706 if codons_only: 1707 setsb = ["\nbegin codons"] 1708 else: 1709 setsb = ["\nbegin sets"] 1710 # - now if characters have been excluded, the character sets need to be adjusted, 1711 # so that they still point to the right character positions 1712 # calculate a list of offsets: for each deleted character, the following character position 1713 # in the new file will have an additional offset of -1 1714 offset = 0 1715 offlist = [] 1716 for c in range(self.nchar): 1717 if c in exclude: 1718 offset += 1 1719 offlist.append( 1720 -1 1721 ) # dummy value as these character positions are excluded 1722 else: 1723 offlist.append(c - offset) 1724 # now adjust each of the character sets 1725 if not codons_only: 1726 for n, ns in self.charsets.items(): 1727 cset = [offlist[c] for c in ns if c not in exclude] 1728 if cset: 1729 setsb.append( 1730 "charset %s = %s" % (safename(n), _compact4nexus(cset)) 1731 ) 1732 for n, s in self.taxsets.items(): 1733 tset = [safename(t, mrbayes=mrbayes) for t in s if t not in delete] 1734 if tset: 1735 setsb.append("taxset %s = %s" % (safename(n), " ".join(tset))) 1736 for n, p in self.charpartitions.items(): 1737 if not include_codons and n == CODONPOSITIONS: 1738 continue 1739 elif codons_only and n != CODONPOSITIONS: 1740 continue 1741 # as characters have been excluded, the partitions must be adjusted 1742 # if a partition is empty, it will be omitted from the charpartition command 1743 # (although paup allows charpartition part=t1:,t2:,t3:1-100) 1744 names = _sort_keys_by_values(p) 1745 newpartition = {} 1746 for sn in names: 1747 nsp = [offlist[c] for c in p[sn] if c not in exclude] 1748 if nsp: 1749 newpartition[sn] = nsp 1750 if newpartition: 1751 if include_codons and n == CODONPOSITIONS: 1752 command = "codonposset" 1753 else: 1754 command = "charpartition" 1755 setsb.append( 1756 "%s %s = %s" 1757 % ( 1758 command, 1759 safename(n), 1760 ", ".join( 1761 "%s: %s" % (sn, _compact4nexus(newpartition[sn])) 1762 for sn in names 1763 if sn in newpartition 1764 ), 1765 ) 1766 ) 1767 # now write charpartititions, much easier than charpartitions 1768 for n, p in self.taxpartitions.items(): 1769 names = _sort_keys_by_values(p) 1770 newpartition = {} 1771 for sn in names: 1772 nsp = [t for t in p[sn] if t not in delete] 1773 if nsp: 1774 newpartition[sn] = nsp 1775 if newpartition: 1776 setsb.append( 1777 "taxpartition %s = %s" 1778 % ( 1779 safename(n), 1780 ", ".join( 1781 "%s: %s" 1782 % ( 1783 safename(sn), 1784 " ".join(safename(x) for x in newpartition[sn]), 1785 ) 1786 for sn in names 1787 if sn in newpartition 1788 ), 1789 ) 1790 ) 1791 # add 'end' and return everything 1792 setsb.append("end;\n") 1793 if len(setsb) == 2: # begin and end only 1794 return "" 1795 else: 1796 return ";\n".join(setsb) 1797 1798 def export_fasta(self, filename=None, width=70): 1799 """Write matrix into a fasta file.""" 1800 if not filename: 1801 if "." in self.filename and self.filename.split(".")[-1].lower() in [ 1802 "paup", 1803 "nexus", 1804 "nex", 1805 "dat", 1806 ]: 1807 filename = ".".join(self.filename.split(".")[:-1]) + ".fas" 1808 else: 1809 filename = self.filename + ".fas" 1810 with open(filename, "w") as fh: 1811 for taxon in self.taxlabels: 1812 fh.write(">" + safename(taxon) + "\n") 1813 for i in range(0, len(str(self.matrix[taxon])), width): 1814 fh.write(str(self.matrix[taxon])[i : i + width] + "\n") 1815 return filename 1816 1817 def export_phylip(self, filename=None): 1818 """Write matrix into a PHYLIP file. 1819 1820 Note that this writes a relaxed PHYLIP format file, where the names 1821 are not truncated, nor checked for invalid characters. 1822 """ 1823 if not filename: 1824 if "." in self.filename and self.filename.split(".")[-1].lower() in [ 1825 "paup", 1826 "nexus", 1827 "nex", 1828 "dat", 1829 ]: 1830 filename = ".".join(self.filename.split(".")[:-1]) + ".phy" 1831 else: 1832 filename = self.filename + ".phy" 1833 with open(filename, "w") as fh: 1834 fh.write("%d %d\n" % (self.ntax, self.nchar)) 1835 for taxon in self.taxlabels: 1836 fh.write("%s %s\n" % (safename(taxon), str(self.matrix[taxon]))) 1837 return filename 1838 1839 def constant(self, matrix=None, delete=(), exclude=()): 1840 """Return a list with all constant characters.""" 1841 if not matrix: 1842 matrix = self.matrix 1843 undelete = [t for t in self.taxlabels if t in matrix and t not in delete] 1844 if not undelete: 1845 return None 1846 elif len(undelete) == 1: 1847 return [x for x in range(len(matrix[undelete[0]])) if x not in exclude] 1848 # get the first sequence and expand all ambiguous values 1849 constant = [ 1850 (x, self.ambiguous_values.get(n.upper(), n.upper())) 1851 for x, n in enumerate(str(matrix[undelete[0]])) 1852 if x not in exclude 1853 ] 1854 1855 for taxon in undelete[1:]: 1856 newconstant = [] 1857 for site in constant: 1858 # print("%d (paup=%d)" % (site[0],site[0]+1), end="") 1859 seqsite = matrix[taxon][site[0]].upper() 1860 # print(seqsite,"checked against",site[1],"\t", end="") 1861 if ( 1862 seqsite == self.missing 1863 or ( 1864 seqsite == self.gap 1865 and self.options["gapmode"].lower() == "missing" 1866 ) 1867 or seqsite == site[1] 1868 ): 1869 # missing or same as before -> ok 1870 newconstant.append(site) 1871 elif ( 1872 seqsite in site[1] 1873 or site[1] == self.missing 1874 or ( 1875 self.options["gapmode"].lower() == "missing" 1876 and site[1] == self.gap 1877 ) 1878 ): 1879 # subset of an ambig or only missing in previous -> take subset 1880 newconstant.append( 1881 (site[0], self.ambiguous_values.get(seqsite, seqsite)) 1882 ) 1883 elif seqsite in self.ambiguous_values: 1884 # is it an ambig: check the intersection with prev. values 1885 intersect = set(self.ambiguous_values[seqsite]).intersection( 1886 set(site[1]) 1887 ) 1888 if intersect: 1889 newconstant.append((site[0], "".join(intersect))) 1890 # print("ok") 1891 # else: 1892 # print("failed") 1893 # else: 1894 # print("failed") 1895 constant = newconstant 1896 cpos = [s[0] for s in constant] 1897 return cpos 1898 1899 def cstatus(self, site, delete=(), narrow=True): 1900 """Summarize character. 1901 1902 narrow=True: paup-mode (a c ? --> ac; ? ? ? --> ?) 1903 narrow=false: (a c ? --> a c g t -; ? ? ? --> a c g t -) 1904 """ 1905 undelete = [t for t in self.taxlabels if t not in delete] 1906 if not undelete: 1907 return None 1908 cstatus = [] 1909 for t in undelete: 1910 c = self.matrix[t][site].upper() 1911 if self.options.get("gapmode") == "missing" and c == self.gap: 1912 c = self.missing 1913 if narrow and c == self.missing: 1914 if c not in cstatus: 1915 cstatus.append(c) 1916 else: 1917 cstatus.extend(b for b in self.ambiguous_values[c] if b not in cstatus) 1918 if self.missing in cstatus and narrow and len(cstatus) > 1: 1919 cstatus = [_ for _ in cstatus if _ != self.missing] 1920 cstatus.sort() 1921 return cstatus 1922 1923 def weighted_stepmatrix(self, name="your_name_here", exclude=(), delete=()): 1924 """Calculate a stepmatrix for weighted parsimony. 1925 1926 See Wheeler (1990), Cladistics 6:269-275 and 1927 Felsenstein (1981), Biol. J. Linn. Soc. 16:183-196 1928 """ 1929 m = StepMatrix(self.unambiguous_letters, self.gap) 1930 for site in [s for s in range(self.nchar) if s not in exclude]: 1931 cstatus = self.cstatus(site, delete) 1932 for i, b1 in enumerate(cstatus[:-1]): 1933 for b2 in cstatus[i + 1 :]: 1934 m.add(b1.upper(), b2.upper(), 1) 1935 return m.transformation().weighting().smprint(name=name) 1936 1937 def crop_matrix(self, matrix=None, delete=(), exclude=()): 1938 """Return a matrix without deleted taxa and excluded characters.""" 1939 if not matrix: 1940 matrix = self.matrix 1941 if [t for t in delete if not self._check_taxlabels(t)]: 1942 raise NexusError( 1943 "Unknown taxa: %s" % ", ".join(set(delete).difference(self.taxlabels)) 1944 ) 1945 if exclude != []: 1946 undelete = [t for t in self.taxlabels if t in matrix and t not in delete] 1947 if not undelete: 1948 return {} 1949 m = [str(matrix[k]) for k in undelete] 1950 sitesm = [s for i, s in enumerate(zip(*m)) if i not in exclude] 1951 if sitesm == []: 1952 return {t: Seq("") for t in undelete} 1953 else: 1954 m = [Seq(s) for s in ("".join(x) for x in zip(*sitesm))] 1955 return dict(zip(undelete, m)) 1956 else: 1957 return { 1958 t: matrix[t] for t in self.taxlabels if t in matrix and t not in delete 1959 } 1960 1961 def bootstrap(self, matrix=None, delete=(), exclude=()): 1962 """Return a bootstrapped matrix.""" 1963 if not matrix: 1964 matrix = self.matrix 1965 seqobjects = isinstance( 1966 matrix[list(matrix.keys())[0]], Seq 1967 ) # remember if Seq objects 1968 cm = self.crop_matrix(delete=delete, exclude=exclude) # crop data out 1969 if not cm: # everything deleted? 1970 return {} 1971 elif not cm[list(cm.keys())[0]]: # everything excluded? 1972 return cm 1973 undelete = [t for t in self.taxlabels if t in cm] 1974 if seqobjects: 1975 sitesm = list(zip(*[str(cm[t]) for t in undelete])) 1976 else: 1977 sitesm = list(zip(*[cm[t] for t in undelete])) 1978 bootstrapsitesm = [ 1979 sitesm[random.randint(0, len(sitesm) - 1)] for _ in range(len(sitesm)) 1980 ] 1981 bootstrapseqs = ["".join(x) for x in zip(*bootstrapsitesm)] 1982 if seqobjects: 1983 bootstrapseqs = [Seq(s) for s in bootstrapseqs] 1984 return dict(zip(undelete, bootstrapseqs)) 1985 1986 def add_sequence(self, name, sequence): 1987 """Add a sequence (string) to the matrix.""" 1988 if not name: 1989 raise NexusError("New sequence must have a name") 1990 1991 diff = self.nchar - len(sequence) 1992 if diff < 0: 1993 self.insert_gap(self.nchar, -diff) 1994 elif diff > 0: 1995 sequence += self.missing * diff 1996 1997 if name in self.taxlabels: 1998 unique_name = _unique_label(self.taxlabels, name) 1999 # print("WARNING: Sequence name %s is already present. Sequence was added as %s." % (name,unique_name)) 2000 else: 2001 unique_name = name 2002 2003 if unique_name in self.matrix: 2004 raise ValueError( 2005 "ERROR. There is a discrepancy between taxlabels " 2006 "and matrix keys. Report this as a bug." 2007 ) 2008 2009 self.matrix[unique_name] = Seq(sequence) 2010 self.ntax += 1 2011 self.taxlabels.append(unique_name) 2012 self.unaltered_taxlabels.append(name) 2013 2014 def insert_gap(self, pos, n=1, leftgreedy=False): 2015 """Add a gap into the matrix and adjust charsets and partitions. 2016 2017 pos=0: first position 2018 pos=nchar: last position 2019 """ 2020 2021 def _adjust(set, x, d, leftgreedy=False): 2022 """Adjust character sets if gaps are inserted (PRIVATE). 2023 2024 Takes care of new gaps within a coherent character set. 2025 """ 2026 # if 3 gaps are inserted at pos. 9 in a set that looks like 1 2 3 8 9 10 11 13 14 15 2027 # then the adjusted set will be 1 2 3 8 9 10 11 12 13 14 15 16 17 18 2028 # but inserting into position 8 it will stay like 1 2 3 11 12 13 14 15 16 17 18 2029 set.sort() 2030 addpos = 0 2031 for i, c in enumerate(set): 2032 if c >= x: 2033 set[i] = c + d 2034 # if we add gaps within a group of characters, we want the gap position included in this group 2035 if c == x: 2036 if leftgreedy or (i > 0 and set[i - 1] == c - 1): 2037 addpos = i 2038 if addpos > 0: 2039 set[addpos:addpos] = list(range(x, x + d)) 2040 return set 2041 2042 if pos < 0 or pos > self.nchar: 2043 raise NexusError("Illegal gap position: %d" % pos) 2044 if n == 0: 2045 return 2046 sitesm = list(zip(*[str(self.matrix[t]) for t in self.taxlabels])) 2047 sitesm[pos:pos] = [["-"] * len(self.taxlabels)] * n 2048 mapped = ["".join(x) for x in zip(*sitesm)] 2049 listed = [(taxon, Seq(mapped[i])) for i, taxon in enumerate(self.taxlabels)] 2050 self.matrix = dict(listed) 2051 self.nchar += n 2052 # now adjust character sets 2053 for i, s in self.charsets.items(): 2054 self.charsets[i] = _adjust(s, pos, n, leftgreedy=leftgreedy) 2055 for p in self.charpartitions: 2056 for sp, s in self.charpartitions[p].items(): 2057 self.charpartitions[p][sp] = _adjust(s, pos, n, leftgreedy=leftgreedy) 2058 # now adjust character state labels 2059 self.charlabels = self._adjust_charlabels(insert=[pos] * n) 2060 return self.charlabels 2061 2062 def _adjust_charlabels(self, exclude=None, insert=None): 2063 """Return adjusted indices of self.charlabels if characters are excluded or inserted (PRIVATE).""" 2064 if exclude and insert: 2065 raise NexusError("Can't exclude and insert at the same time") 2066 if not self.charlabels: 2067 return None 2068 labels = sorted(self.charlabels) 2069 newcharlabels = {} 2070 if exclude: 2071 exclude.sort() 2072 exclude.append(sys.maxsize) 2073 excount = 0 2074 for c in labels: 2075 if c not in exclude: 2076 while c > exclude[excount]: 2077 excount += 1 2078 newcharlabels[c - excount] = self.charlabels[c] 2079 elif insert: 2080 insert.sort() 2081 insert.append(sys.maxsize) 2082 icount = 0 2083 for c in labels: 2084 while c >= insert[icount]: 2085 icount += 1 2086 newcharlabels[c + icount] = self.charlabels[c] 2087 else: 2088 return self.charlabels 2089 return newcharlabels 2090 2091 def invert(self, charlist): 2092 """Return all character indices that are not in charlist.""" 2093 return [c for c in range(self.nchar) if c not in charlist] 2094 2095 def gaponly(self, include_missing=False): 2096 """Return gap-only sites.""" 2097 gap = set(self.gap) 2098 if include_missing: 2099 gap.add(self.missing) 2100 sitesm = zip(*[str(self.matrix[t]) for t in self.taxlabels]) 2101 return [i for i, site in enumerate(sitesm) if set(site).issubset(gap)] 2102 2103 def terminal_gap_to_missing(self, missing=None, skip_n=True): 2104 """Replace all terminal gaps with missing character. 2105 2106 Mixtures like ???------??------- are properly resolved. 2107 """ 2108 if not missing: 2109 missing = self.missing 2110 replace = [self.missing, self.gap] 2111 if not skip_n: 2112 replace.extend(["n", "N"]) 2113 for taxon in self.taxlabels: 2114 sequence = str(self.matrix[taxon]) 2115 length = len(sequence) 2116 start, end = get_start_end(sequence, skiplist=replace) 2117 if start == -1 and end == -1: 2118 sequence = missing * length 2119 else: 2120 sequence = sequence[: end + 1] + missing * (length - end - 1) 2121 sequence = start * missing + sequence[start:] 2122 if length != len(sequence): 2123 raise RuntimeError( 2124 "Illegal sequence manipulation in " 2125 "Nexus.terminal_gap_to_missing in taxon %s" % taxon 2126 ) 2127 self.matrix[taxon] = Seq(sequence) 2128 2129 2130try: 2131 import cnexus 2132except ImportError: 2133 2134 def _get_command_lines(file_contents): 2135 lines = _kill_comments_and_break_lines(file_contents) 2136 commandlines = _adjust_lines(lines) 2137 return commandlines 2138 2139 2140else: 2141 2142 def _get_command_lines(file_contents): 2143 decommented = cnexus.scanfile(file_contents) 2144 # check for unmatched parentheses 2145 if decommented == "[" or decommented == "]": 2146 raise NexusError("Unmatched %s" % decommented) 2147 # cnexus can't return lists, so in analogy we separate 2148 # commandlines with chr(7) (a character that shouldn't be part of a 2149 # nexus file under normal circumstances) 2150 commandlines = _adjust_lines(decommented.split(chr(7))) 2151 return commandlines 2152 2153 2154if __name__ == "__main__": 2155 from Bio._utils import run_doctest 2156 2157 run_doctest() 2158