/*
 * Jalview - A Sequence Alignment Editor and Viewer (2.11.1.4)
 * Copyright (C) 2021 The Jalview Authors
 *
 * This file is part of Jalview.
 *
 * Jalview is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *
 * Jalview is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE.  See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
 * The Jalview Authors are detailed in the 'AUTHORS' file.
 */
/*
 * This extension was written by Benjamin Schuster-Boeckler at sanger.ac.uk
 */
package jalview.io;

import jalview.analysis.Rna;
import jalview.datamodel.AlignmentAnnotation;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.Annotation;
import jalview.datamodel.DBRefEntry;
import jalview.datamodel.DBRefSource;
import jalview.datamodel.Mapping;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.schemes.ResidueProperties;
import jalview.util.Comparison;
import jalview.util.DBRefUtils;
import jalview.util.Format;
import jalview.util.MessageManager;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Vector;

import com.stevesoft.pat.Regex;

import fr.orsay.lri.varna.exceptions.ExceptionUnmatchedClosingParentheses;
import fr.orsay.lri.varna.factories.RNAFactory;
import fr.orsay.lri.varna.models.rna.RNA;

/**
 * Reads and writes alignments in Stockholm format.
 * <p>
 * Known limitations (TODO): the database source and version are not known when
 * the {@code #=GS ... AC} tag that associates accessions with sequences is
 * parsed, so the originating database is guessed (see
 * {@link #guessDatabaseFor(Sequence, String, String)}). Database references
 * are also not parsed completely: a separate reference string parser is needed
 * to turn the database reference form into Jalview's local representation.
 *
 * @author bsb at sanger.ac.uk
 * @author Natasha Shersnev (Dundee, UK) (Stockholm file writer)
 * @author Lauren Lui (UCSC, USA) (RNA secondary structure annotation import as
 *         stockholm)
 * @author Anne Menard (Paris, FR) (VARNA parsing of Stockholm file data)
 * @version 0.3 + jalview mods
 */
public class StockholmFile extends AlignFile
{
  /** key under which the raw text of a {@code #=GR} row is accumulated */
  private static final String ANNOTATION = "annotation";

  /** replaces the {@code <} and {@code [} bracket types with {@code (} */
  private static final Regex OPEN_PAREN = new Regex("(<|\\[)", "(");

  /** replaces the {@code >} and {@code ]} bracket types with {@code )} */
  private static final Regex CLOSE_PAREN = new Regex("(>|\\])", ")");

  /** matches any single bracket character */
  public static final Regex DETECT_BRACKETS = new Regex(
          "(<|>|\\[|\\]|\\(|\\)|\\{|\\})");

  /**
   * WUSS extended symbols. Ambiguity with protein secondary structure
   * annotation is avoided by testing a row against {@link #NOT_RNASS} first.
   */
  public static final String RNASS_BRACKETS = "<>[](){}AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz";

  /**
   * Matches a whole annotation row that is NOT RNA secondary structure, i.e.
   * one that contains only E, H, e, h and other non-bracket, non-alphabetic
   * characters. The character class previously excluded only E and e, so rows
   * containing H or h (protein helix) were classified as RNA, contradicting
   * the stated intent; H and h are now permitted as well.
   */
  private static final Regex NOT_RNASS = new Regex(
          "^[^<>[\\](){}A-DF-GI-Za-df-gi-z]*$");

  /** output buffer, replaced on every call to {@link #print(SequenceI[], boolean)} */
  StringBuffer out;

  /**
   * alignment supplying file properties and alignment annotation for output;
   * only set by {@link #StockholmFile(AlignmentI)}, otherwise null
   */
  AlignmentI al;

  /**
   * Creates an empty StockholmFile with no source and no alignment.
   */
  public StockholmFile()
  {
  }

  /**
   * Creates a new StockholmFile object for output.
   *
   * @param al
   *          alignment whose properties and annotation are written by
   *          {@link #print(SequenceI[], boolean)}
   */
  public StockholmFile(AlignmentI al)
  {
    this.al = al;
  }

  /**
   * Creates a StockholmFile for the given source, delegating to the
   * superclass constructor.
   *
   * @param inFile
   *          source passed to the superclass
   * @param type
   *          kind of source that {@code inFile} denotes
   * @throws IOException
   *           if the superclass constructor fails
   */
  public StockholmFile(String inFile, DataSourceType type)
          throws IOException
  {
    super(inFile, type);
  }

  /**
   * Creates a StockholmFile for an already opened source, delegating to the
   * superclass constructor.
   *
   * @param source
   *          the source to read from
   * @throws IOException
   *           if the superclass constructor fails
   */
  public StockholmFile(FileParse source) throws IOException
  {
    super(source);
  }

  @Override
  public void initData()
  {
    super.initData();
  }

  /**
   * Parse a file in Stockholm format into Jalview's data model using VARNA.
   * One sequence is created per RNA returned by VARNA; each gets a
   * "Sec. str." annotation row built from the RNA's dot-bracket structure.
   * All sequences are named after the data source (see
   * {@link #safeName(String)}).
   *
   * @param inFile
   *          the Stockholm file to read
   * @throws IOException
   *           if there is an error with the input file, or the structure
   *           annotation contains unmatched parentheses (in which case
   *           {@code errormessage} is also set)
   */
  public void parse_with_VARNA(java.io.File inFile) throws IOException
  {
    List<RNA> result;
    // try-with-resources so that the file handle is released even on failure
    try (BufferedReader reader = new BufferedReader(new FileReader(inFile)))
    {
      result = RNAFactory.loadSecStrStockholm(reader);
    } catch (ExceptionUnmatchedClosingParentheses umcp)
    {
      errormessage = "Unmatched parentheses in annotation. Aborting ("
              + umcp.getMessage() + ")";
      throw new IOException(umcp);
    }

    SequenceI[] seqs = new SequenceI[result.size()];
    String id = safeName(getDataName());
    for (int i = 0; i < result.size(); i++)
    {
      RNA current = result.get(i);
      String seq = current.getSeq();
      String rna = current.getStructDBN(true);

      int begin = 0;
      int end = seq.length() - 1;
      seqs[i] = new Sequence(id, seq, begin, end);

      // one Annotation per structure symbol
      Annotation[] ann = new Annotation[rna.length()];
      for (int j = 0; j < rna.length(); j++)
      {
        String symbol = rna.substring(j, j + 1);
        ann[j] = new Annotation(symbol, "",
                Rna.getRNASecStrucState(symbol).charAt(0), 0f);
      }
      AlignmentAnnotation align = new AlignmentAnnotation("Sec. str.",
              current.getID(), ann);

      seqs[i].addAlignmentAnnotation(align);
      seqs[i].setRNA(current);
      this.annotations.addElement(align);
    }
    this.setSeqs(seqs);
  }

  /**
   * Parse a file in Stockholm format into Jalview's data model. The file has
   * to be passed at construction time.
   * <p>
   * Parsing stops at the first {@code //} terminator line; sequences, their
   * annotation and any pending Newick tree are added at that point.
   *
   * @throws IOException
   *           if the first line is not a Stockholm header, a sequence line
   *           cannot be split into identifier and sequence, or an unknown
   *           {@code #=G?} annotation type is encountered
   */
  @Override
  public void parse() throws IOException
  {
    StringBuffer treeString = new StringBuffer();
    String treeName = null;
    String line;
    // per-sequence annotation, keyed by the sequence identifier in the file
    Hashtable<String, Hashtable<String, Object>> seqAnn = new Hashtable<>();
    // aligned sequence text by identifier, in order of first appearance
    LinkedHashMap<String, String> seqData = new LinkedHashMap<>();

    // First, check that this file has STOCKHOLM format, i.e. the first line
    // must match the header pattern
    Regex header = new Regex("# STOCKHOLM ([\\d\\.]+)");
    String firstLine = nextLine();
    if (firstLine == null || !header.search(firstLine))
    {
      throw new IOException(MessageManager
              .getString("exception.stockholm_invalid_format"));
    }

    // Regexes used repeatedly below. They are created per call because a
    // Regex holds the state of its last match.
    Regex rend = new Regex("^\\s*\\/\\/"); // end of an alignment
    Regex p = new Regex("(\\S+)\\/(\\d+)\\-(\\d+)"); // id/from-to
    Regex s = new Regex("(\\S+)\\s+(\\S*)\\s+(.*)"); // annotation subtype
    Regex r = new Regex("#=(G[FSRC]?)\\s+(.*)"); // any annotation line
    Regex x = new Regex("(\\S+)\\s+(\\S+)"); // id and sequence

    rend.optimize();
    p.optimize();
    s.optimize();
    r.optimize();
    x.optimize();

    while ((line = nextLine()) != null)
    {
      if (line.length() == 0)
      {
        continue;
      }
      if (rend.search(line))
      {
        // End of the alignment, pass stuff back
        addParsedSequences(seqData, seqAnn, p);
        // a tree not followed by another #=GF TN line is still pending here
        addPendingTree(treeName, treeString);
        return; // finished parsing this segment of source
      }
      else if (!r.search(line))
      {
        // Sequence line: split into accession and sequence parts
        if (!x.search(line))
        {
          throw new IOException(MessageManager.formatMessage(
                  "exception.couldnt_parse_sequence_line", new String[]
                  { line }));
        }
        // wrapped (interleaved) files repeat the identifier, so append
        String name = x.stringMatched(1);
        String ns = seqData.get(name);
        if (ns == null)
        {
          ns = "";
        }
        ns += x.stringMatched(2);
        seqData.put(name, ns);
      }
      else
      {
        String annType = r.stringMatched(1);
        String annContent = r.stringMatched(2);

        if (annType.equals("GF"))
        {
          /*
           * Generic per-File annotation, free text.
           *
           * Magic features:
           *   #=GF NH <tree in New Hampshire eXtended format>
           *   #=GF TN <Unique identifier for the next tree>
           *
           * Pfam descriptions: 7. DESCRIPTION OF FIELDS
           *
           * Compulsory fields:
           *   AC Accession number: Accession number in form PFxxxxx.version
           *      or PBxxxxxx.
           *   ID Identification: One word name for family.
           *   DE Definition: Short description of family.
           *   AU Author: Authors of the entry.
           *   SE Source of seed: The source suggesting the seed members
           *      belong to one family.
           *   GA Gathering method: Search threshold to build the full
           *      alignment.
           *   TC Trusted Cutoff: Lowest sequence score and domain score of
           *      match in the full alignment.
           *   NC Noise Cutoff: Highest sequence score and domain score of
           *      match not in full alignment.
           *   TP Type: Type of family -- presently Family, Domain, Motif or
           *      Repeat.
           *   SQ Sequence: Number of sequences in alignment.
           *   AM Alignment Method: The order ls and fs hits are aligned to
           *      the model to build the full align.
           *   // End of alignment.
           *
           * Optional fields:
           *   DC Database Comment: Comment about database reference.
           *   DR Database Reference: Reference to external database.
           *   RC Reference Comment: Comment about literature reference.
           *   RN Reference Number: Reference Number.
           *   RM Reference Medline: Eight digit medline UI number.
           *   RT Reference Title: Reference Title.
           *   RA Reference Author: Reference Author
           *   RL Reference Location: Journal location.
           *   PI Previous identifier: Record of all previous ID lines.
           *   KW Keywords: Keywords.
           *   CC Comment: Comments.
           *   NE Pfam accession: Indicates a nested domain.
           *   NL Location: Location of nested domains - sequence ID, start
           *      and end of insert.
           *
           * Obsolete fields:
           *   AL Alignment method of seed: The method used to align the seed
           *      members.
           */
          // Let's save the annotations, maybe we'll be able to do something
          // with them later...
          Regex an = new Regex("(\\w+)\\s*(.*)");
          if (an.search(annContent))
          {
            String tag = an.stringMatched(1);
            String value = an.stringMatched(2);
            if (tag.equals("NH"))
            {
              treeString.append(value);
            }
            else if (tag.equals("TN"))
            {
              // a new tree name closes the tree collected so far
              addPendingTree(treeName, treeString);
              treeName = value;
              treeString = new StringBuffer();
            }
            // TODO: JAL-3532 - this is where GF comments and database
            // references are lost; suggest overriding this method for
            // Stockholm files to catch and properly process CC, DR etc into
            // multivalued properties
            setAlignmentProperty(tag, value);
          }
        }
        else if (annType.equals("GS"))
        {
          /*
           * Generic per-Sequence annotation, free text. Pfam uses these
           * features:
           *   AC <accession>          ACcession number
           *   DE <freetext>           DEscription
           *   DR <db>; <accession>;   Database Reference
           *   OS <organism>           OrganiSm (species)
           *   OC <clade>              Organism Classification (clade, etc.)
           *   LO <look>               Look (Color, etc.)
           */
          if (s.search(annContent))
          {
            String acc = s.stringMatched(1);
            String type = s.stringMatched(2);
            String content = s.stringMatched(3);
            // TODO: store DR in a vector.
            // TODO: store AC according to generic file db annotation.
            annotationsFor(seqAnn, acc).put(type, content);
          }
          else
          {
            System.err.println(">> missing annotation: " + line);
          }
        }
        else if (annType.equals("GC"))
        {
          // Generic per-Column annotation, exactly 1 char per column;
          // always needs a label.
          if (x.search(annContent))
          {
            // parse out and create alignment annotation directly.
            parseAnnotationRow(annotations, x.stringMatched(1),
                    x.stringMatched(2));
          }
        }
        else if (annType.equals("GR"))
        {
          /*
           * Generic per-Sequence AND per-Column markup, exactly 1 char per
           * column.
           *
           *   Feature  Description            Markup letters
           *   SS       Secondary Structure    [HGIEBTSCX]
           *   SA       Surface Accessibility  [0-9X] (0=0%-10%; ...;
           *                                   9=90%-100%)
           *   TM       TransMembrane          [Mio]
           *   PP       Posterior Probability  [0-9*] (0=0.00-0.05;
           *                                   1=0.05-0.15; *=0.95-1.00)
           *   LI       LIgand binding         [*]
           *   AS       Active Site            [*]
           *   IN       INtron (in or after)   [0-2]
           */
          if (s.search(annContent))
          {
            addSequenceColumnAnnotation(seqAnn, s.stringMatched(1),
                    s.stringMatched(2), s.stringMatched(3));
          }
          else
          {
            System.err.println(
                    "Warning - couldn't parse sequence annotation row line:\n"
                            + line);
          }
        }
        else
        {
          throw new IOException(MessageManager.formatMessage(
                  "exception.unknown_annotation_detected", new String[]
                  { annType, annContent }));
        }
      }
    }
    // no terminator line was found: still keep any tree that was read
    addPendingTree(treeName, treeString);
  }

  /**
   * Registers the Newick string collected so far, if any, as a tree.
   *
   * @param treeName
   *          name given by the preceding {@code #=GF TN} line, or null to use
   *          "Tree " followed by the next tree number
   * @param treeString
   *          concatenated content of the {@code #=GF NH} lines; nothing is
   *          added when it is empty
   */
  private void addPendingTree(String treeName, StringBuffer treeString)
  {
    if (treeString.length() > 0)
    {
      String name = (treeName == null) ? "Tree " + (getTreeCount() + 1)
              : treeName;
      addNewickTree(name, treeString.toString());
    }
  }

  /**
   * Returns the annotation table for a sequence identifier, creating and
   * registering an empty one if none exists yet.
   *
   * @param seqAnn
   *          all per-sequence annotation tables, keyed by identifier
   * @param acc
   *          sequence identifier as it appears in the file
   * @return the (possibly new) table for {@code acc}
   */
  private static Hashtable<String, Object> annotationsFor(
          Hashtable<String, Hashtable<String, Object>> seqAnn, String acc)
  {
    Hashtable<String, Object> ann = seqAnn.get(acc);
    if (ann == null)
    {
      ann = new Hashtable<>();
      seqAnn.put(acc, ann);
    }
    return ann;
  }

  /**
   * Records one {@code #=GR} row. The row text is appended to any text
   * already stored for the same sequence and type (wrapped files), and a
   * hidden annotation row built from the accumulated text replaces any
   * earlier one stored for that type.
   *
   * @param seqAnn
   *          all per-sequence annotation tables, keyed by identifier
   * @param acc
   *          sequence identifier the row belongs to
   * @param type
   *          two letter feature code, e.g. SS
   * @param seq
   *          the markup, one character per column
   */
  private void addSequenceColumnAnnotation(
          Hashtable<String, Hashtable<String, Object>> seqAnn, String acc,
          String type, String seq)
  {
    Hashtable<String, Object> ann = annotationsFor(seqAnn, acc);

    @SuppressWarnings("unchecked") // only this method stores "features"
    Hashtable<String, Hashtable<String, String>> features = (Hashtable<String, Hashtable<String, String>>) ann
            .get("features");
    if (features == null)
    {
      features = new Hashtable<>();
      ann.put("features", features);
    }

    String featureType = id2type(type);
    Hashtable<String, String> content = features.get(featureType);
    if (content == null)
    {
      content = new Hashtable<>();
      features.put(featureType, content);
    }

    // append the annotation line to whatever earlier blocks contributed
    String ns = content.get(ANNOTATION);
    if (ns == null)
    {
      ns = "";
    }
    ns += seq;
    content.put(ANNOTATION, ns);

    // Now a new row is created with the current set of data
    Vector<AlignmentAnnotation> newStruc = new Vector<>();
    parseAnnotationRow(newStruc, type, ns);
    for (AlignmentAnnotation alan : newStruc)
    {
      alan.visible = false;
    }
    // new annotation overwrites any existing annotation...
    ann.put(type, newStruc);
  }

  /**
   * Turns the collected sequence text and per-sequence annotation into
   * {@link Sequence} objects and adds them to this file's sequences. Called
   * when the alignment terminator is reached.
   *
   * @param seqData
   *          aligned sequence text keyed by identifier
   * @param seqAnn
   *          per-sequence annotation tables; the table of each processed
   *          sequence is removed
   * @param idRange
   *          regex splitting an identifier into id, start and end
   */
  private void addParsedSequences(Map<String, String> seqData,
          Hashtable<String, Hashtable<String, Object>> seqAnn,
          Regex idRange)
  {
    this.noSeqs = seqData.size();

    // the alignment's own accession tells us which database it came from
    String dbsource = null;
    Regex pf = new Regex("PF[0-9]{5}(.*)"); // Finds AC for Pfam
    Regex rf = new Regex("RF[0-9]{5}(.*)"); // Finds AC for Rfam
    Object alignmentAc = getAlignmentProperty("AC");
    if (alignmentAc != null)
    {
      String dbType = alignmentAc.toString();
      if (pf.search(dbType))
      {
        // PFAM Alignment - so references are typically from Uniprot
        dbsource = "PFAM";
      }
      else if (rf.search(dbType))
      {
        dbsource = "RFAM";
      }
    }

    for (Map.Entry<String, String> entry : seqData.entrySet())
    {
      String acc = entry.getKey();
      String seq = entry.getValue();
      if (maxLength < seq.length())
      {
        maxLength = seq.length();
      }
      int start = 1;
      int end = -1;
      String sid = acc;

      // Retrieve the annotations for this accession (null if there are none)
      // TODO: add structures to sequence
      Hashtable<String, Object> accAnnotations = seqAnn.remove(acc);

      // Split accession in id and from/to
      if (idRange.search(acc))
      {
        sid = idRange.stringMatched(1);
        start = Integer.parseInt(idRange.stringMatched(2));
        end = Integer.parseInt(idRange.stringMatched(3));
      }

      Sequence seqO = new Sequence(sid, seq, start, end);
      if (accAnnotations != null)
      {
        // Add Description (if any)
        if (accAnnotations.containsKey("DE"))
        {
          String desc = (String) accAnnotations.get("DE");
          seqO.setDescription((desc == null) ? "" : desc);
        }
        // Add DB References (if any)
        if (accAnnotations.containsKey("DR"))
        {
          String dbr = (String) accAnnotations.get("DR");
          if (dbr != null && dbr.indexOf(";") > -1)
          {
            String src = dbr.substring(0, dbr.indexOf(";"));
            String acn = dbr.substring(dbr.indexOf(";") + 1);
            DBRefUtils.parseToDbRef(seqO, src, "0", acn);
          }
        }
        if (accAnnotations.containsKey("AC"))
        {
          String dbr = (String) accAnnotations.get("AC");
          if (dbr != null)
          {
            // we could get very clever here - but for now - just try to
            // guess accession type from type of sequence, source of
            // alignment plus structure of accession
            guessDatabaseFor(seqO, dbr, dbsource);
          }
          // else - do what ? add the data anyway and prompt the user to
          // specify what references these are ?
        }

        @SuppressWarnings("unchecked") // stored by addSequenceColumnAnnotation
        Hashtable<String, Hashtable<String, String>> features = (Hashtable<String, Hashtable<String, String>>) accAnnotations
                .remove("features");
        if (features != null)
        {
          addFeatures(seqO, accAnnotations, features);
        }
      }
      this.seqs.addElement(seqO);
    }
  }

  /**
   * Adds the {@code #=GR} markup of one sequence to it, either as the
   * annotation rows already built for the feature type or, failing that, as
   * one sequence feature per marked column.
   *
   * @param seqO
   *          the sequence to annotate
   * @param accAnnotations
   *          annotation table of the sequence, holding the annotation rows
   *          keyed by two letter feature code
   * @param features
   *          raw markup keyed by feature type name, then by description
   */
  private void addFeatures(Sequence seqO,
          Hashtable<String, Object> accAnnotations,
          Hashtable<String, Hashtable<String, String>> features)
  {
    // We need to adjust the positions of all features to account for gaps
    int[] posmap = seqO.findPositionMap();
    for (Map.Entry<String, Hashtable<String, String>> feature : features
            .entrySet())
    {
      // TODO: parse out secondary structure annotation as annotation row
      // TODO: parse out scores as annotation row
      // TODO: map coding region to core jalview feature types
      String type = feature.getKey();
      String key = type2id(type);

      // have we added annotation rows for this type ?
      boolean annotsAdded = false;
      if (key != null)
      {
        // a #=GS line may have stored free text under the same code, so only
        // treat the value as annotation rows when it really is a Vector
        Object rows = accAnnotations.get(key);
        if (rows instanceof Vector)
        {
          for (Object row : (Vector<?>) rows)
          {
            annotsAdded = true;
            AlignmentAnnotation an = (AlignmentAnnotation) row;
            seqO.addAlignmentAnnotation(an);
            annotations.add(an);
          }
        }
      }

      for (Map.Entry<String, String> item : feature.getValue().entrySet())
      {
        String desc = item.getKey();
        if (ANNOTATION.equals(desc) && annotsAdded)
        {
          // don't add features if we already added an annotation row
          continue;
        }
        char[] byChar = item.getValue().toCharArray();
        // markup longer than the sequence has no column to map to
        for (int k = 0; k < byChar.length && k < posmap.length; k++)
        {
          char c = byChar[k];
          // PFAM uses '.' for feature background
          if (!(c == ' ' || c == '_' || c == '-' || c == '.'))
          {
            // look up nearest sequence position to this column
            int new_pos = posmap[k];
            SequenceFeature feat = new SequenceFeature(type, desc, new_pos,
                    new_pos, null);
            seqO.addSequenceFeature(feat);
          }
        }
      }
    }
  }

  /**
   * Demangle an accession string and guess the originating sequence database
   * for a given sequence. An accession of the form {@code ACC.v/from-to} adds
   * a reference to the guessed sequence database for {@code ACC} as well as a
   * reference to the alignment database for the full string; when both
   * {@code from} and {@code to} are valid numbers each reference is given a
   * mapping from the sequence's range to {@code from-to}.
   *
   * @param seqO
   *          sequence to be annotated
   * @param dbr
   *          Accession string for sequence
   * @param dbsource
   *          source database for alignment (PFAM or RFAM), or null to guess
   *          from whether the sequence is protein
   */
  private void guessDatabaseFor(Sequence seqO, String dbr, String dbsource)
  {
    List<DBRefEntry> dbrs = new ArrayList<>();
    String sdbac = "" + dbr;
    int st = -1;
    int en = -1;

    int slash = sdbac.indexOf("/");
    if (slash > -1)
    {
      String range = sdbac.substring(slash + 1);
      sdbac = sdbac.substring(0, slash);
      int dash = range.indexOf("-");
      if (dash > -1)
      {
        // the start is the text BEFORE the dash; including the dash made the
        // number unparsable, so no mapping was ever created
        st = parseIndex(range.substring(0, dash));
        if (dash + 1 < range.length())
        {
          en = parseIndex(range.substring(dash + 1));
        }
      }
      else
      {
        st = parseIndex(range);
      }
    }

    if (dbsource == null)
    {
      // make up an origin based on whether the sequence looks like it is
      // nucleotide or protein
      dbsource = (seqO.isProtein()) ? "PFAM" : "RFAM";
    }
    // EMBL is a total guess - could be ENA, or something else these days
    String seqdb = dbsource.equals("PFAM") ? "UNIPROT" : "EMBL";

    int dot = sdbac.indexOf(".");
    if (dot > -1)
    {
      // strip off the version suffix
      sdbac = sdbac.substring(0, dot);
      DBRefEntry seqRef = DBRefUtils.parseToDbRef(seqO, seqdb, dbsource,
              sdbac);
      if (seqRef != null)
      {
        dbrs.add(seqRef);
      }
    }
    DBRefEntry alignmentRef = DBRefUtils.parseToDbRef(seqO, dbsource,
            dbsource, dbr);
    if (alignmentRef != null)
    {
      dbrs.add(alignmentRef);
    }

    if (st != -1 && en != -1)
    {
      for (DBRefEntry d : dbrs)
      {
        jalview.util.MapList mp = new jalview.util.MapList(
                new int[]
                { seqO.getStart(), seqO.getEnd() }, new int[] { st, en }, 1,
                1);
        d.setMap(new Mapping(mp));
      }
    }
  }

  /**
   * Parses a sequence index.
   *
   * @param text
   *          text expected to hold a decimal integer, surrounding whitespace
   *          allowed
   * @return the parsed value, or -1 if {@code text} is not a valid integer
   */
  private static int parseIndex(String text)
  {
    try
    {
      return Integer.parseInt(text.trim());
    } catch (NumberFormatException ignored)
    {
      // could warn here that index is invalid
      return -1;
    }
  }

  /**
   * Converts one row of per-column markup into annotation and adds it to the
   * given list. If the list already holds a row whose label is the resolved
   * type name, the new columns are appended to that row; otherwise a new row
   * is created and added.
   *
   * @param annotation
   *          the rows collected so far; may be extended
   * @param label
   *          feature code of the row; a trailing {@code _cons} is ignored when
   *          resolving the type
   * @param annots
   *          the markup, one character per column
   * @return the row that was created or extended
   */
  protected static AlignmentAnnotation parseAnnotationRow(
          Vector<AlignmentAnnotation> annotation, String label,
          String annots)
  {
    String type = label;
    if (label.contains("_cons"))
    {
      type = (label.indexOf("_cons") == label.length() - 5)
              ? label.substring(0, label.length() - 5)
              : label;
    }
    type = id2type(type);

    boolean ss = false;
    boolean isrnass = false;
    if (type.equalsIgnoreCase("secondary structure"))
    {
      ss = true;
      // sorry about the double negative here (it's easier for dealing with
      // other non-alpha-non-brace chars)
      isrnass = !NOT_RNASS.search(annots);
    }
    boolean posterior = type.equalsIgnoreCase("posterior probability");

    Annotation[] els = new Annotation[annots.length()];
    for (int i = 0; i < annots.length(); i++)
    {
      String pos = annots.substring(i, i + 1);
      // 0f is 'valid' null - will not be written out
      Annotation ann = new Annotation(pos, "", ' ', 0f);
      if (ss)
      {
        if (isrnass && RNASS_BRACKETS.indexOf(pos) >= 0)
        {
          ann.secondaryStructure = Rna.getRNASecStrucState(pos).charAt(0);
          ann.displayCharacter = "" + pos.charAt(0);
        }
        else
        {
          ann.secondaryStructure = ResidueProperties.getDssp3state(pos)
                  .charAt(0);
          if (ann.secondaryStructure == pos.charAt(0))
          {
            ann.displayCharacter = "";
          }
          else
          {
            // keep the original symbol as an alternative label
            ann.displayCharacter = " " + ann.displayCharacter;
          }
        }
      }
      if (posterior && !ann.isWhitespace()
              && !Comparison.isGap(pos.charAt(0)))
      {
        // symbol encodes values - 0..*==0..10
        float val;
        if (pos.charAt(0) == '*')
        {
          val = 10;
        }
        else
        {
          val = pos.charAt(0) - '0';
          if (val > 9)
          {
            val = 10;
          }
        }
        ann.value = val;
      }
      els[i] = ann;
    }

    // look for a row of this type that is being continued
    AlignmentAnnotation annot = null;
    for (AlignmentAnnotation candidate : annotation)
    {
      if (candidate.label.equals(type))
      {
        annot = candidate;
        break;
      }
    }
    if (annot == null)
    {
      annot = new AlignmentAnnotation(type, type, els);
      annotation.addElement(annot);
    }
    else
    {
      Annotation[] anns = new Annotation[annot.annotations.length
              + els.length];
      System.arraycopy(annot.annotations, 0, anns, 0,
              annot.annotations.length);
      System.arraycopy(els, 0, anns, annot.annotations.length, els.length);
      annot.annotations = anns;
    }
    return annot;
  }

  /**
   * Formats a database reference as {@code source ; accession}.
   *
   * @param ref
   *          the reference to format
   * @return the formatted record
   */
  private String dbref_to_ac_record(DBRefEntry ref)
  {
    return ref.getSource().toString() + " ; "
            + ref.getAccessionId().toString();
  }

  /**
   * Writes the given sequences as one complete Stockholm alignment: header,
   * {@code #=GF} lines for the alignment properties, {@code #=GS} accession
   * lines, each sequence preceded by its {@code #=GR} rows, {@code #=GC} rows
   * for the alignment annotation, and the {@code //} terminator. Alignment
   * properties and alignment annotation are only written when this object was
   * created with an alignment.
   *
   * @param s
   *          sequences to write; output stops at the first null element
   * @param jvSuffix
   *          passed to {@code printId} when forming sequence identifiers
   * @return the alignment in Stockholm format
   */
  @Override
  public String print(SequenceI[] s, boolean jvSuffix)
  {
    if (s == null)
    {
      s = new SequenceI[0];
    }
    out = new StringBuffer();
    out.append("# STOCKHOLM 1.0");
    out.append(newline);

    // find max length of id and collect one database reference per sequence
    int maxid = 0;
    int in = 0;
    Hashtable<String, String> dataRef = null;
    // guard: an empty array or a leading null used to fail here
    boolean isAA = s.length > 0 && s[0] != null && s[0].isProtein();
    while ((in < s.length) && (s[in] != null))
    {
      String tmp = printId(s[in], jvSuffix);
      if (tmp.length() > maxid)
      {
        maxid = tmp.length();
      }
      DBRefEntry[] refs = s[in].getDBRefs();
      if (refs != null)
      {
        if (dataRef == null)
        {
          dataRef = new Hashtable<>();
        }
        List<DBRefEntry> primrefs = s[in].getPrimaryDBRefs();
        if (primrefs.size() >= 1)
        {
          dataRef.put(tmp, dbref_to_ac_record(primrefs.get(0)));
        }
        else
        {
          for (int idb = 0; idb < refs.length; idb++)
          {
            DBRefEntry dbref = refs[idb];
            dataRef.put(tmp, dbref_to_ac_record(dbref));
            // if we put in a uniprot or EMBL record then we're done:
            String canonical = DBRefUtils
                    .getCanonicalName(dbref.getSource());
            if (isAA && DBRefSource.UNIPROT.equals(canonical))
            {
              break;
            }
            if (!isAA && DBRefSource.EMBL.equals(canonical))
            {
              break;
            }
          }
        }
      }
      in++;
    }
    maxid += 9;

    // output database type
    if (al != null && al.getProperties() != null
            && !al.getProperties().isEmpty())
    {
      Enumeration<?> key = al.getProperties().keys();
      Enumeration<?> val = al.getProperties().elements();
      while (key.hasMoreElements())
      {
        out.append("#=GF " + key.nextElement() + " " + val.nextElement());
        out.append(newline);
      }
    }

    // output database accessions
    if (dataRef != null)
    {
      for (Map.Entry<String, String> ref : dataRef.entrySet())
      {
        String type = ref.getValue();
        out.append(new Format("%-" + (maxid - 2) + "s")
                .form("#=GS " + ref.getKey() + " "));
        if (isAA && type.contains("UNIPROT")
                || (!isAA && type.contains("EMBL")))
        {
          out.append(" AC " + type.substring(type.indexOf(";") + 1));
        }
        else
        {
          out.append(" DR " + type + " ");
        }
        out.append(newline);
      }
    }

    // output annotations followed by the sequence they belong to
    int i = 0;
    while (i < s.length && s[i] != null)
    {
      AlignmentAnnotation[] alAnot = s[i].getAnnotation();
      if (alAnot != null)
      {
        for (int j = 0; j < alAnot.length; j++)
        {
          String key = type2id(alAnot[j].label);
          boolean isrna = alAnot[j].isValidStruc();
          if (isrna)
          {
            // hardwire to secondary structure if there is RNA secondary
            // structure on the annotation
            key = "SS";
          }
          if (key == null)
          {
            continue;
          }

          out.append(new Format("%-" + maxid + "s").form(
                  "#=GR " + printId(s[i], jvSuffix) + " " + key + " "));
          Annotation[] ann = alAnot[j].annotations;
          for (int k = 0; k < ann.length; k++)
          {
            out.append(outputCharacter(key, k, isrna, ann, s[i]));
          }
          out.append(newline);
        }
      }

      out.append(new Format("%-" + maxid + "s")
              .form(printId(s[i], jvSuffix) + " "));
      out.append(s[i].getSequenceAsString());
      out.append(newline);
      i++;
    }

    // alignment annotation
    if (al != null && al.getAlignmentAnnotation() != null)
    {
      for (int ia = 0; ia < al.getAlignmentAnnotation().length; ia++)
      {
        AlignmentAnnotation aa = al.getAlignmentAnnotation()[ia];
        if (aa.autoCalculated || !aa.visible || aa.sequenceRef != null)
        {
          continue;
        }
        String label;
        String key = "";
        if (aa.label.equals("seq"))
        {
          label = "seq_cons";
        }
        else
        {
          key = type2id(aa.label.toLowerCase());
          label = (key == null) ? aa.label : key + "_cons";
        }
        label = label.replace(" ", "_");

        out.append(
                new Format("%-" + maxid + "s").form("#=GC " + label + " "));
        boolean isrna = aa.isValidStruc();
        for (int j = 0; j < aa.annotations.length; j++)
        {
          out.append(outputCharacter(key, j, isrna, aa.annotations, null));
        }
        out.append(newline);
      }
    }

    out.append("//");
    out.append(newline);

    return out.toString();
  }

  /**
   * Chooses the character written for one column of an annotation row.
   *
   * @param key
   *          two letter feature code of the row, may be null or empty
   * @param k
   *          column index into {@code ann}
   * @param isrna
   *          true if the row holds RNA secondary structure, in which case a
   *          blank in an SS row is written as '.'
   * @param ann
   *          the row's annotations
   * @param sequenceI
   *          sequence the row belongs to, or null for alignment annotation;
   *          supplies the character for columns without annotation
   * @return the character to write
   */
  private char outputCharacter(String key, int k, boolean isrna,
          Annotation[] ann, SequenceI sequenceI)
  {
    Annotation annot = ann[k];
    boolean isSS = "SS".equals(key);

    String ch;
    if (annot == null)
    {
      ch = (sequenceI == null) ? "-"
              : Character.toString(sequenceI.getCharAt(k));
    }
    else
    {
      ch = (annot.displayCharacter == null)
              ? String.valueOf(annot.secondaryStructure)
              : annot.displayCharacter;
    }

    if (isSS)
    {
      if (annot == null)
      {
        // sensible gap character
        return isrna ? '.' : ' ';
      }
      // valid secondary structure AND no alternative label (e.g. ' B')
      if (annot.secondaryStructure > ' ' && ch.length() < 2)
      {
        return annot.secondaryStructure;
      }
    }

    char seq;
    if (ch.length() == 0)
    {
      seq = '.';
    }
    else if (ch.length() == 1)
    {
      seq = ch.charAt(0);
    }
    else
    {
      // two character labels carry the symbol in second position
      seq = ch.charAt(1);
    }
    return (seq == ' ' && isSS && isrna) ? '.' : seq;
  }

  /**
   * Writes this file's own sequences in Stockholm format.
   * <p>
   * {@link #print(SequenceI[], boolean)} already produces the header and the
   * {@code //} terminator, so its result is returned unchanged; appending a
   * second terminator here produced a spurious trailing {@code //} line.
   *
   * @return the alignment in Stockholm format
   */
  public String print()
  {
    return print(getSeqsAsArray(), false);
  }

  /** Stockholm two letter feature codes and the type names they stand for */
  private static final Map<String, String> typeIds = new LinkedHashMap<>();

  static
  {
    typeIds.put("SS", "Secondary Structure");
    typeIds.put("SA", "Surface Accessibility");
    typeIds.put("TM", "transmembrane");
    typeIds.put("PP", "Posterior Probability");
    typeIds.put("LI", "ligand binding");
    typeIds.put("AS", "active site");
    typeIds.put("IN", "intron");
    typeIds.put("IR", "interacting residue");
    typeIds.put("AC", "accession");
    typeIds.put("OS", "organism");
    typeIds.put("CL", "class");
    typeIds.put("DE", "description");
    typeIds.put("DR", "reference");
    typeIds.put("LO", "look");
    typeIds.put("RF", "Reference Positions");
  }

  /**
   * Resolves a Stockholm feature code to its type name.
   *
   * @param id
   *          two letter feature code, e.g. SS
   * @return the type name, or {@code id} itself (after a warning on standard
   *         error) if the code is not known
   */
  protected static String id2type(String id)
  {
    if (typeIds.containsKey(id))
    {
      return typeIds.get(id);
    }
    System.err.println(
            "Warning : Unknown Stockholm annotation type code " + id);
    return id;
  }

  /**
   * Resolves a type name to its Stockholm feature code, ignoring case.
   *
   * @param type
   *          type name, e.g. "Secondary Structure"
   * @return the two letter code, or null (after a warning on standard error)
   *         if the type is not known
   */
  protected static String type2id(String type)
  {
    for (Map.Entry<String, String> entry : typeIds.entrySet())
    {
      if (entry.getValue().equalsIgnoreCase(type))
      {
        return entry.getKey();
      }
    }
    System.err.println(
            "Warning : Unknown Stockholm annotation type: " + type);
    return null;
  }

  /**
   * Make a friendly ID string: the part of the data name after its last '/',
   * without the file extension (everything from the last '.').
   * <p>
   * The previous implementation took {@code substring(1, length -
   * indexOf(".") + 1)}, which dropped the first character, kept part of the
   * extension and threw for names without a '.'.
   *
   * @param dataName
   *          name or path of the data source, may be null
   * @return the friendly name; empty if {@code dataName} is null
   */
  private String safeName(String dataName)
  {
    if (dataName == null)
    {
      return "";
    }
    String name = dataName.substring(dataName.lastIndexOf('/') + 1).trim();
    int dot = name.lastIndexOf('.');
    // a leading dot is part of the name, not an extension separator
    if (dot > 0)
    {
      name = name.substring(0, dot).trim();
    }
    return name;
  }
}