1 /* 2 * Jalview - A Sequence Alignment Editor and Viewer (2.11.1.4) 3 * Copyright (C) 2021 The Jalview Authors 4 * 5 * This file is part of Jalview. 6 * 7 * Jalview is free software: you can redistribute it and/or 8 * modify it under the terms of the GNU General Public License 9 * as published by the Free Software Foundation, either version 3 10 * of the License, or (at your option) any later version. 11 * 12 * Jalview is distributed in the hope that it will be useful, but 13 * WITHOUT ANY WARRANTY; without even the implied warranty 14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR 15 * PURPOSE. See the GNU General Public License for more details. 16 * 17 * You should have received a copy of the GNU General Public License 18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>. 19 * The Jalview Authors are detailed in the 'AUTHORS' file. 20 */ 21 package jalview.ext.ensembl; 22 23 import jalview.api.FeatureColourI; 24 import jalview.api.FeatureSettingsModelI; 25 import jalview.datamodel.AlignmentI; 26 import jalview.datamodel.GeneLociI; 27 import jalview.datamodel.Sequence; 28 import jalview.datamodel.SequenceFeature; 29 import jalview.datamodel.SequenceI; 30 import jalview.datamodel.features.SequenceFeatures; 31 import jalview.io.gff.SequenceOntologyFactory; 32 import jalview.io.gff.SequenceOntologyI; 33 import jalview.schemes.FeatureColour; 34 import jalview.schemes.FeatureSettingsAdapter; 35 import jalview.util.MapList; 36 37 import java.awt.Color; 38 import java.io.UnsupportedEncodingException; 39 import java.net.URLDecoder; 40 import java.util.ArrayList; 41 import java.util.Arrays; 42 import java.util.List; 43 44 import com.stevesoft.pat.Regex; 45 46 /** 47 * A class that fetches genomic sequence and all transcripts for an Ensembl gene 48 * 49 * @author gmcarstairs 50 */ 51 public class EnsemblGene extends EnsemblSeqProxy 52 { 53 /* 54 * accepts anything as we will attempt lookup of gene or 55 * transcript id or gene name 56 */ 57 private static final Regex ACCESSION_REGEX = new Regex(".*"); 58 59 private static final EnsemblFeatureType[] FEATURES_TO_FETCH = { 60 EnsemblFeatureType.gene, EnsemblFeatureType.transcript, 61 EnsemblFeatureType.exon, EnsemblFeatureType.cds, 62 EnsemblFeatureType.variation }; 63 64 private static final String CHROMOSOME = "chromosome"; 65 66 /** 67 * Default constructor (to use rest.ensembl.org) 68 */ EnsemblGene()69 public EnsemblGene() 70 { 71 super(); 72 } 73 74 /** 75 * Constructor given the target domain to fetch data from 76 * 77 * @param d 78 */ EnsemblGene(String d)79 public EnsemblGene(String d) 80 { 81 super(d); 82 } 83 84 @Override getDbName()85 public String getDbName() 86 { 87 return "ENSEMBL"; 88 } 89 90 @Override getFeaturesToFetch()91 protected EnsemblFeatureType[] getFeaturesToFetch() 92 { 93 return FEATURES_TO_FETCH; 94 } 95 96 @Override getSourceEnsemblType()97 protected EnsemblSeqType getSourceEnsemblType() 98 { 99 return EnsemblSeqType.GENOMIC; 100 } 101 102 @Override getObjectType()103 protected String getObjectType() 104 { 105 return OBJECT_TYPE_GENE; 106 } 107 108 /** 109 * Returns an alignment containing the gene(s) for the given gene or 110 * transcript identifier, or external identifier (e.g. Uniprot id). If given a 111 * gene name or external identifier, returns any related gene sequences found 112 * for model organisms. If only a single gene is queried for, then its 113 * transcripts are also retrieved and added to the alignment. <br> 114 * Method: 115 * <ul> 116 * <li>resolves a transcript identifier by looking up its parent gene id</li> 117 * <li>resolves an external identifier by looking up xref-ed gene ids</li> 118 * <li>fetches the gene sequence</li> 119 * <li>fetches features on the sequence</li> 120 * <li>identifies "transcript" features whose Parent is the requested 121 * gene</li> 122 * <li>fetches the transcript sequence for each transcript</li> 123 * <li>makes a mapping from the gene to each transcript</li> 124 * <li>copies features from gene to transcript sequences</li> 125 * <li>fetches the protein sequence for each transcript, maps and saves it as 126 * a cross-reference</li> 127 * <li>aligns each transcript against the gene sequence based on the position 128 * mappings</li> 129 * </ul> 130 * 131 * @param query 132 * a single gene or transcript identifier or gene name 133 * @return an alignment containing a gene, and possibly transcripts, or null 134 */ 135 @Override getSequenceRecords(String query)136 public AlignmentI getSequenceRecords(String query) throws Exception 137 { 138 /* 139 * convert to a non-duplicated list of gene identifiers 140 */ 141 List<String> geneIds = getGeneIds(query); 142 143 AlignmentI al = null; 144 for (String geneId : geneIds) 145 { 146 /* 147 * fetch the gene sequence(s) with features and xrefs 148 */ 149 AlignmentI geneAlignment = super.getSequenceRecords(geneId); 150 if (geneAlignment == null) 151 { 152 continue; 153 } 154 155 if (geneAlignment.getHeight() == 1) 156 { 157 // ensure id has 'correct' case for the Ensembl identifier 158 geneId = geneAlignment.getSequenceAt(0).getName(); 159 160 findGeneLoci(geneAlignment.getSequenceAt(0), geneId); 161 162 getTranscripts(geneAlignment, geneId); 163 } 164 if (al == null) 165 { 166 al = geneAlignment; 167 } 168 else 169 { 170 al.append(geneAlignment); 171 } 172 } 173 return al; 174 } 175 176 /** 177 * Calls the /lookup/id REST service, parses the response for gene 178 * coordinates, and if successful, adds these to the sequence. If this fails, 179 * fall back on trying to parse the sequence description in case it is in 180 * Ensembl-gene format e.g. chromosome:GRCh38:17:45051610:45109016:1. 181 * 182 * @param seq 183 * @param geneId 184 */ findGeneLoci(SequenceI seq, String geneId)185 void findGeneLoci(SequenceI seq, String geneId) 186 { 187 GeneLociI geneLoci = new EnsemblLookup(getDomain()).getGeneLoci(geneId); 188 if (geneLoci != null) 189 { 190 seq.setGeneLoci(geneLoci.getSpeciesId(), geneLoci.getAssemblyId(), 191 geneLoci.getChromosomeId(), geneLoci.getMapping()); 192 } 193 else 194 { 195 parseChromosomeLocations(seq); 196 } 197 } 198 199 /** 200 * Parses and saves fields of an Ensembl-style description e.g. 201 * chromosome:GRCh38:17:45051610:45109016:1 202 * 203 * @param seq 204 */ parseChromosomeLocations(SequenceI seq)205 boolean parseChromosomeLocations(SequenceI seq) 206 { 207 String description = seq.getDescription(); 208 if (description == null) 209 { 210 return false; 211 } 212 String[] tokens = description.split(":"); 213 if (tokens.length == 6 && tokens[0].startsWith(CHROMOSOME)) 214 { 215 String ref = tokens[1]; 216 String chrom = tokens[2]; 217 try 218 { 219 int chStart = Integer.parseInt(tokens[3]); 220 int chEnd = Integer.parseInt(tokens[4]); 221 boolean forwardStrand = "1".equals(tokens[5]); 222 String species = ""; // not known here 223 int[] from = new int[] { seq.getStart(), seq.getEnd() }; 224 int[] to = new int[] { forwardStrand ? chStart : chEnd, 225 forwardStrand ? chEnd : chStart }; 226 MapList map = new MapList(from, to, 1, 1); 227 seq.setGeneLoci(species, ref, chrom, map); 228 return true; 229 } catch (NumberFormatException e) 230 { 231 System.err.println("Bad integers in description " + description); 232 } 233 } 234 return false; 235 } 236 237 /** 238 * Converts a query, which may contain one or more gene, transcript, or 239 * external (to Ensembl) identifiers, into a non-redundant list of gene 240 * identifiers. 241 * 242 * @param accessions 243 * @return 244 */ getGeneIds(String accessions)245 List<String> getGeneIds(String accessions) 246 { 247 List<String> geneIds = new ArrayList<>(); 248 249 for (String acc : accessions.split(getAccessionSeparator())) 250 { 251 /* 252 * First try lookup as an Ensembl (gene or transcript) identifier 253 */ 254 String geneId = new EnsemblLookup(getDomain()).getGeneId(acc); 255 if (geneId != null) 256 { 257 if (!geneIds.contains(geneId)) 258 { 259 geneIds.add(geneId); 260 } 261 } 262 else 263 { 264 /* 265 * if given a gene or other external name, lookup and fetch 266 * the corresponding gene for all model organisms 267 */ 268 List<String> ids = new EnsemblSymbol(getDomain(), getDbSource(), 269 getDbVersion()).getGeneIds(acc); 270 for (String id : ids) 271 { 272 if (!geneIds.contains(id)) 273 { 274 geneIds.add(id); 275 } 276 } 277 } 278 } 279 return geneIds; 280 } 281 282 /** 283 * Constructs all transcripts for the gene, as identified by "transcript" 284 * features whose Parent is the requested gene. The coding transcript 285 * sequences (i.e. with introns omitted) are added to the alignment. 286 * 287 * @param al 288 * @param accId 289 * @throws Exception 290 */ getTranscripts(AlignmentI al, String accId)291 protected void getTranscripts(AlignmentI al, String accId) 292 throws Exception 293 { 294 SequenceI gene = al.getSequenceAt(0); 295 List<SequenceFeature> transcriptFeatures = getTranscriptFeatures(accId, 296 gene); 297 298 for (SequenceFeature transcriptFeature : transcriptFeatures) 299 { 300 makeTranscript(transcriptFeature, al, gene); 301 } 302 303 clearGeneFeatures(gene); 304 } 305 306 /** 307 * Remove unwanted features (transcript, exon, CDS) from the gene sequence 308 * after we have used them to derive transcripts and transfer features 309 * 310 * @param gene 311 */ clearGeneFeatures(SequenceI gene)312 protected void clearGeneFeatures(SequenceI gene) 313 { 314 /* 315 * Note we include NMD_transcript_variant here because it behaves like 316 * 'transcript' in Ensembl, although strictly speaking it is not 317 * (it is a sub-type of sequence_variant) 318 */ 319 String[] soTerms = new String[] { 320 SequenceOntologyI.NMD_TRANSCRIPT_VARIANT, 321 SequenceOntologyI.TRANSCRIPT, SequenceOntologyI.EXON, 322 SequenceOntologyI.CDS }; 323 List<SequenceFeature> sfs = gene.getFeatures().getFeaturesByOntology( 324 soTerms); 325 for (SequenceFeature sf : sfs) 326 { 327 gene.deleteFeature(sf); 328 } 329 } 330 331 /** 332 * Constructs a spliced transcript sequence by finding 'exon' features for the 333 * given id (or failing that 'CDS'). Copies features on to the new sequence. 334 * 'Aligns' the new sequence against the gene sequence by padding with gaps, 335 * and adds it to the alignment. 336 * 337 * @param transcriptFeature 338 * @param al 339 * the alignment to which to add the new sequence 340 * @param gene 341 * the parent gene sequence, with features 342 * @return 343 */ makeTranscript(SequenceFeature transcriptFeature, AlignmentI al, SequenceI gene)344 SequenceI makeTranscript(SequenceFeature transcriptFeature, AlignmentI al, 345 SequenceI gene) 346 { 347 String accId = getTranscriptId(transcriptFeature); 348 if (accId == null) 349 { 350 return null; 351 } 352 353 /* 354 * NB we are mapping from gene sequence (not genome), so do not 355 * need to check for reverse strand (gene and transcript sequences 356 * are in forward sense) 357 */ 358 359 /* 360 * make a gene-length sequence filled with gaps 361 * we will fill in the bases for transcript regions 362 */ 363 char[] seqChars = new char[gene.getLength()]; 364 Arrays.fill(seqChars, al.getGapCharacter()); 365 366 /* 367 * look for exon features of the transcript, failing that for CDS 368 * (for example ENSG00000124610 has 1 CDS but no exon features) 369 */ 370 String parentId = accId; 371 List<SequenceFeature> splices = findFeatures(gene, 372 SequenceOntologyI.EXON, parentId); 373 if (splices.isEmpty()) 374 { 375 splices = findFeatures(gene, SequenceOntologyI.CDS, parentId); 376 } 377 SequenceFeatures.sortFeatures(splices, true); 378 379 int transcriptLength = 0; 380 final char[] geneChars = gene.getSequence(); 381 int offset = gene.getStart(); // to convert to 0-based positions 382 List<int[]> mappedFrom = new ArrayList<>(); 383 384 for (SequenceFeature sf : splices) 385 { 386 int start = sf.getBegin() - offset; 387 int end = sf.getEnd() - offset; 388 int spliceLength = end - start + 1; 389 System.arraycopy(geneChars, start, seqChars, start, spliceLength); 390 transcriptLength += spliceLength; 391 mappedFrom.add(new int[] { sf.getBegin(), sf.getEnd() }); 392 } 393 394 Sequence transcript = new Sequence(accId, seqChars, 1, 395 transcriptLength); 396 397 /* 398 * Ensembl has gene name as transcript Name 399 * EnsemblGenomes doesn't, but has a url-encoded description field 400 */ 401 String description = transcriptFeature.getDescription(); 402 if (description == null) 403 { 404 description = (String) transcriptFeature.getValue(DESCRIPTION); 405 } 406 if (description != null) 407 { 408 try 409 { 410 transcript.setDescription(URLDecoder.decode(description, "UTF-8")); 411 } catch (UnsupportedEncodingException e) 412 { 413 e.printStackTrace(); // as if 414 } 415 } 416 transcript.createDatasetSequence(); 417 418 al.addSequence(transcript); 419 420 /* 421 * transfer features to the new sequence; we use EnsemblCdna to do this, 422 * to filter out unwanted features types (see method retainFeature) 423 */ 424 List<int[]> mapTo = new ArrayList<>(); 425 mapTo.add(new int[] { 1, transcriptLength }); 426 MapList mapping = new MapList(mappedFrom, mapTo, 1, 1); 427 EnsemblCdna cdna = new EnsemblCdna(getDomain()); 428 cdna.transferFeatures(gene.getFeatures().getPositionalFeatures(), 429 transcript.getDatasetSequence(), mapping, parentId); 430 431 mapTranscriptToChromosome(transcript, gene, mapping); 432 433 /* 434 * fetch and save cross-references 435 */ 436 cdna.getCrossReferences(transcript); 437 438 /* 439 * and finally fetch the protein product and save as a cross-reference 440 */ 441 cdna.addProteinProduct(transcript); 442 443 return transcript; 444 } 445 446 /** 447 * If the gene has a mapping to chromosome coordinates, derive the transcript 448 * chromosome regions and save on the transcript sequence 449 * 450 * @param transcript 451 * @param gene 452 * @param mapping 453 * the mapping from gene to transcript positions 454 */ mapTranscriptToChromosome(SequenceI transcript, SequenceI gene, MapList mapping)455 protected void mapTranscriptToChromosome(SequenceI transcript, 456 SequenceI gene, MapList mapping) 457 { 458 GeneLociI loci = gene.getGeneLoci(); 459 if (loci == null) 460 { 461 return; 462 } 463 464 MapList geneMapping = loci.getMapping(); 465 466 List<int[]> exons = mapping.getFromRanges(); 467 List<int[]> transcriptLoci = new ArrayList<>(); 468 469 for (int[] exon : exons) 470 { 471 transcriptLoci.add(geneMapping.locateInTo(exon[0], exon[1])); 472 } 473 474 List<int[]> transcriptRange = Arrays.asList(new int[] { 475 transcript.getStart(), transcript.getEnd() }); 476 MapList mapList = new MapList(transcriptRange, transcriptLoci, 1, 1); 477 478 transcript.setGeneLoci(loci.getSpeciesId(), loci.getAssemblyId(), 479 loci.getChromosomeId(), mapList); 480 } 481 482 /** 483 * Returns the 'transcript_id' property of the sequence feature (or null) 484 * 485 * @param feature 486 * @return 487 */ getTranscriptId(SequenceFeature feature)488 protected String getTranscriptId(SequenceFeature feature) 489 { 490 return (String) feature.getValue(JSON_ID); 491 } 492 493 /** 494 * Returns a list of the transcript features on the sequence whose Parent is 495 * the gene for the accession id. 496 * <p> 497 * Transcript features are those of type "transcript", or any of its sub-types 498 * in the Sequence Ontology e.g. "mRNA", "processed_transcript". We also 499 * include "NMD_transcript_variant", because this type behaves like a 500 * transcript identifier in Ensembl, although strictly speaking it is not in 501 * the SO. 502 * 503 * @param accId 504 * @param geneSequence 505 * @return 506 */ getTranscriptFeatures(String accId, SequenceI geneSequence)507 protected List<SequenceFeature> getTranscriptFeatures(String accId, 508 SequenceI geneSequence) 509 { 510 List<SequenceFeature> transcriptFeatures = new ArrayList<>(); 511 512 String parentIdentifier = accId; 513 514 List<SequenceFeature> sfs = geneSequence.getFeatures() 515 .getFeaturesByOntology(SequenceOntologyI.TRANSCRIPT); 516 sfs.addAll(geneSequence.getFeatures().getPositionalFeatures( 517 SequenceOntologyI.NMD_TRANSCRIPT_VARIANT)); 518 519 for (SequenceFeature sf : sfs) 520 { 521 String parent = (String) sf.getValue(PARENT); 522 if (parentIdentifier.equalsIgnoreCase(parent)) 523 { 524 transcriptFeatures.add(sf); 525 } 526 } 527 528 return transcriptFeatures; 529 } 530 531 @Override getDescription()532 public String getDescription() 533 { 534 return "Fetches all transcripts and variant features for a gene or transcript"; 535 } 536 537 /** 538 * Default test query is a gene id (can also enter a transcript id) 539 */ 540 @Override getTestQuery()541 public String getTestQuery() 542 { 543 return "ENSG00000157764"; // BRAF, 5 transcripts, reverse strand 544 // ENSG00000090266 // NDUFB2, 15 transcripts, forward strand 545 // ENSG00000101812 // H2BFM histone, 3 transcripts, forward strand 546 // ENSG00000123569 // H2BFWT histone, 2 transcripts, reverse strand 547 } 548 549 /** 550 * Answers a list of sequence features (if any) whose type is 'gene' (or a 551 * subtype of gene in the Sequence Ontology), and whose ID is the accession we 552 * are retrieving 553 */ 554 @Override getIdentifyingFeatures(SequenceI seq, String accId)555 protected List<SequenceFeature> getIdentifyingFeatures(SequenceI seq, 556 String accId) 557 { 558 List<SequenceFeature> result = new ArrayList<>(); 559 List<SequenceFeature> sfs = seq.getFeatures() 560 .getFeaturesByOntology(SequenceOntologyI.GENE); 561 for (SequenceFeature sf : sfs) 562 { 563 String id = (String) sf.getValue(JSON_ID); 564 if (accId.equalsIgnoreCase(id)) 565 { 566 result.add(sf); 567 } 568 } 569 return result; 570 } 571 572 /** 573 * Answers true unless feature type is 'gene', or 'transcript' with a parent 574 * which is a different gene. We need the gene features to identify the range, 575 * but it is redundant information on the gene sequence. Checking the parent 576 * allows us to drop transcript features which belong to different 577 * (overlapping) genes. 578 */ 579 @Override retainFeature(SequenceFeature sf, String accessionId)580 protected boolean retainFeature(SequenceFeature sf, String accessionId) 581 { 582 SequenceOntologyI so = SequenceOntologyFactory.getInstance(); 583 String type = sf.getType(); 584 if (so.isA(type, SequenceOntologyI.GENE)) 585 { 586 return false; 587 } 588 if (isTranscript(type)) 589 { 590 String parent = (String) sf.getValue(PARENT); 591 if (!accessionId.equalsIgnoreCase(parent)) 592 { 593 return false; 594 } 595 } 596 return true; 597 } 598 599 /** 600 * Override to do nothing as Ensembl doesn't return a protein sequence for a 601 * gene identifier 602 */ 603 @Override addProteinProduct(SequenceI querySeq)604 protected void addProteinProduct(SequenceI querySeq) 605 { 606 } 607 608 @Override getAccessionValidator()609 public Regex getAccessionValidator() 610 { 611 return ACCESSION_REGEX; 612 } 613 614 /** 615 * Returns a descriptor for suitable feature display settings with 616 * <ul> 617 * <li>only exon or sequence_variant features (or their subtypes in the 618 * Sequence Ontology) visible</li> 619 * <li>variant features coloured red</li> 620 * <li>exon features coloured by label (exon name)</li> 621 * <li>variants displayed above (on top of) exons</li> 622 * </ul> 623 */ 624 @Override getFeatureColourScheme()625 public FeatureSettingsModelI getFeatureColourScheme() 626 { 627 return new FeatureSettingsAdapter() 628 { 629 SequenceOntologyI so = SequenceOntologyFactory.getInstance(); 630 631 @Override 632 public boolean isFeatureHidden(String type) 633 { 634 return (!so.isA(type, SequenceOntologyI.EXON) 635 && !so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT)); 636 } 637 638 @Override 639 public FeatureColourI getFeatureColour(String type) 640 { 641 if (so.isA(type, SequenceOntologyI.EXON)) 642 { 643 return new FeatureColour() 644 { 645 @Override 646 public boolean isColourByLabel() 647 { 648 return true; 649 } 650 }; 651 } 652 if (so.isA(type, SequenceOntologyI.SEQUENCE_VARIANT)) 653 { 654 return new FeatureColour() 655 { 656 657 @Override 658 public Color getColour() 659 { 660 return Color.RED; 661 } 662 }; 663 } 664 return null; 665 } 666 667 /** 668 * order to render sequence_variant after exon after the rest 669 */ 670 @Override 671 public int compare(String feature1, String feature2) 672 { 673 if (so.isA(feature1, SequenceOntologyI.SEQUENCE_VARIANT)) 674 { 675 return +1; 676 } 677 if (so.isA(feature2, SequenceOntologyI.SEQUENCE_VARIANT)) 678 { 679 return -1; 680 } 681 if (so.isA(feature1, SequenceOntologyI.EXON)) 682 { 683 return +1; 684 } 685 if (so.isA(feature2, SequenceOntologyI.EXON)) 686 { 687 return -1; 688 } 689 return 0; 690 } 691 }; 692 } 693 694 } 695