/* ChadoCanonicalGene.java * * created: 2006 * * This file is part of Artemis * * Copyright (C) 2006 Genome Research Limited * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * * $Header: //tmp/pathsoft/artemis/uk/ac/sanger/artemis/io/ChadoCanonicalGene.java,v 1.34 2009-08-11 08:59:46 tjc Exp $ */ package uk.ac.sanger.artemis.io; import uk.ac.sanger.artemis.components.genebuilder.GeneUtils; import uk.ac.sanger.artemis.util.DatabaseDocument; import uk.ac.sanger.artemis.util.ReadOnlyException; import uk.ac.sanger.artemis.util.StringVector; import java.util.Collections; import java.util.Comparator; import java.util.Iterator; import java.util.Vector; import java.util.Hashtable; import java.util.Enumeration; import java.util.List; import java.util.Set; import java.util.HashSet; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.gmod.schema.sequence.FeatureLoc; /** * Used by GFFStreamFeature to represent the chado canonical gene. * Contains gene, transcript, exons and proteins. **/ public class ChadoCanonicalGene { private Feature gene; // part_of gene private List transcripts = new Vector(); // part_of transcripts private Hashtable> splicedFeatures = new Hashtable>(); // derives_from transript private Hashtable proteins = new Hashtable(); // utr features private Hashtable> three_prime_utr = new Hashtable>(); private Hashtable> five_prime_utr = new Hashtable>(); // other child features of transcript private Hashtable> other_features = new Hashtable>(); // srcfeature private int srcfeature_id; // srcfeature length private int seqlen; /** * Get the gene feaure object. * @return */ public Feature getGene() { return gene; } /** * Set the gene feature object. * @param gene */ public void setGene(Feature gene) { this.gene = gene; } public String getGeneUniqueName() { try { return getQualifier(getGene(), "ID"); } catch(InvalidRelationException e) { return null; } } /** * Add a transcript to the model * @param transcript */ public void addTranscript(Feature transcript) { transcripts.add(transcript); } /** * Delete a transcript and child features. * @param transcript_name */ public void deleteTranscript(String transcript_name) { for(int i=0; i children) { updateNames(splicedFeatures,oldName,newName); updateNames(proteins,oldName,newName); updateNames(three_prime_utr,oldName,newName); updateNames(five_prime_utr,oldName,newName); updateNames(other_features,oldName,newName); if(children != null) GeneUtils.fixParentQualifier(oldName, newName, children); } /** * Utility for changing the key used in a Hashtable * @param hash * @param oldName * @param newName */ private static void updateNames(final Hashtable hash, final String oldName, final String newName) { Object features = hash.get(oldName); if(features != null) { hash.remove(oldName); hash.put(newName, features); } } /** * Delete features. * @param embl_feature */ public void deleteFeature(final Feature embl_feature) { try { final String name = getQualifier(embl_feature, "ID"); Object feature = getSplicedFeatures(name); if(feature != null) { String transcript_name = getQualifier((Feature) feature, "Parent"); splicedFeatures.remove(transcript_name); return; } final Enumeration enum_protein = proteins.keys(); while(enum_protein.hasMoreElements()) { final String transcriptName = (String)enum_protein.nextElement(); Feature protein = (Feature)proteins.get(transcriptName); if(getQualifier(protein, "ID").equals(name)) { proteins.remove(transcriptName); return; } } feature = getFeatureFromHash(name, three_prime_utr); if(feature != null) { String transcript_name = getQualifier((Feature) feature, "Parent"); List utr = get3UtrOfTranscript(transcript_name); utr.remove(feature); return; } feature = getFeatureFromHash(name, five_prime_utr); if(feature != null) { String transcript_name = getQualifier((Feature) feature, "Parent"); List utr = get5UtrOfTranscript(transcript_name); utr.remove(feature); return; } feature = getFeatureFromHash(name, other_features); if(feature != null) { String transcript_name = getQualifier((Feature) feature, "Parent"); List others = getOtherFeaturesOfTranscript(transcript_name); others.remove(feature); return; } deleteTranscript(name); } catch(InvalidRelationException e1) { e1.printStackTrace(); } } /** * Get all child members of a feature * @param embl_feature * @return */ public Set getChildren(Feature embl_feature) { Set children = new HashSet(); try { String name = getQualifier(embl_feature, "ID"); String gene_name = getQualifier(getGene(), "ID"); if(name.equals(gene_name)) { List transcripts = getTranscripts(); for(int i=0; i pep_enum = proteins.elements(); while(pep_enum.hasMoreElements()) { Feature child = pep_enum.nextElement(); String parent = getQualifier(child, "Derives_from"); if(parent != null && parent.equals(name)) children.add(child); } return children; } catch(InvalidRelationException e) { e.printStackTrace(); } return null; } /** * Search in a Hashtable for child Features with a * matching parent ID. Child features are added to the Set * that is passed into this method. * @param hash Hashtable to search for children in * @param parent_id uniquname to look for * @param children collection to add child features to * @throws InvalidRelationException */ private void searchForChildren(Hashtable> hash, String parent_id, Set children) throws InvalidRelationException { Enumeration> feature_enum = hash.elements(); String parent; while(feature_enum.hasMoreElements()) { List child_list = feature_enum.nextElement(); for(int i=0; i v_spliced; if(splicedFeatures.containsKey(transcript_name)) v_spliced = (Vector)splicedFeatures.get(transcript_name); else v_spliced = new Vector(); v_spliced.add(spliced); splicedFeatures.put(transcript_name, v_spliced); } public void correctSpliceSiteAssignments() { Enumeration enumSplicedFeatures = splicedFeatures.keys(); while(enumSplicedFeatures.hasMoreElements()) { String transcriptId = enumSplicedFeatures.nextElement(); Vector v_spliced = (Vector)splicedFeatures.get(transcriptId); Set splicedTypes = getSpliceTypes(transcriptId); Iterator it = splicedTypes.iterator(); while(it.hasNext()) { String type = it.next(); if(!type.equals(DatabaseDocument.EXONMODEL) && !type.equals("pseudogenic_exon") && !type.equals("exon")) { List splicedFeatures = getSpliceSitesOfTranscript(transcriptId, type); if(splicedFeatures.size() == 1) { Feature f = (Feature)splicedFeatures.get(0); addOtherFeatures(transcriptId, f); v_spliced.remove(f); try { f.removeQualifierByName("feature_relationship_rank"); } catch(ReadOnlyException e){} catch(EntryInformationException e){} } } } splicedFeatures.put(transcriptId, v_spliced); } } /** * Add protein feature to the chado gene model. * @param transcript_name * @param protein * @throws InvalidRelationException */ public void addProtein(final String transcript_name, final Feature protein) { proteins.put(transcript_name, protein); } /** * Add 3'UTR to chado gene model. * @param transcript_name * @param utr * @throws InvalidRelationException */ public void add3PrimeUtr(final String transcript_name, final Feature utr) { final List utr_list; if(three_prime_utr.containsKey(transcript_name)) utr_list = three_prime_utr.get(transcript_name); else utr_list = new Vector(); utr_list.add(utr); three_prime_utr.put(transcript_name, utr_list); } /** * Add 5'UTR to chado gene model. * @param transcript_name * @param utr * @throws InvalidRelationException */ public void add5PrimeUtr(final String transcript_name, final Feature utr) { final List utr_list; if(five_prime_utr.containsKey(transcript_name)) utr_list = (Vector)five_prime_utr.get(transcript_name); else utr_list = new Vector(); utr_list.add(utr); five_prime_utr.put(transcript_name, utr_list); } /** * Add other child features of a transcript to the chado * gene model. * @param transcript_name * @param other_feature */ public void addOtherFeatures(final String transcript_name, final Feature other_feature) { final List v_other_features; if(other_features.containsKey(transcript_name)) v_other_features = (Vector)other_features.get(transcript_name); else v_other_features = new Vector(); v_other_features.add(other_feature); other_features.put(transcript_name, v_other_features); } /** * Check if this gene model contains a transcript with an ID equal to * any of the names in the StringVector. If it does find * it returns the transcript feature, otherwise it returns null. * @param names * @return */ public Feature containsTranscript(final StringVector names) { for(int i=0; i getSpliceSitesOfTranscript(final String transcript_name, final String type) { if(splicedFeatures.containsKey(transcript_name)) { List splicedFeaturesOfTranscript = splicedFeatures.get(transcript_name); List results = new Vector(); for(int i=0; i getSpliceTypes(final String transcript_name) { if(splicedFeatures.containsKey(transcript_name)) { List splicedFeaturesOfTranscript = splicedFeatures.get(transcript_name); Set splicedTypes = new HashSet(); for(int i=0; iList. * @param transcript_name * @return */ public List getSplicedFeaturesOfTranscript(final String transcript_name) { if(splicedFeatures.containsKey(transcript_name)) { return splicedFeatures.get(transcript_name); } return null; } /** * Return the transcript from the name of a constituent feature * @param constituent feature name * @return transcript */ public Feature getTranscriptFeatureFromName(final String name) { String transcriptName = getTranscriptFromName(name); if(transcriptName == null) return null; try { for (int i = 0; i < transcripts.size(); i++) { Feature feature = (Feature) transcripts.get(i); if (getQualifier(feature, "ID").equals(transcriptName)) return feature; } } catch (InvalidRelationException ire){} return null; } /** * Return the transcript from the name of a constituent feature * @param constituent feature name * @return transcript name */ public String getTranscriptFromName(final String name) { // check transcript StringVector sv = new StringVector(); sv.add(name); Feature feature = containsTranscript(sv); if(feature != null) return name; // check exons List transcriptNames = getTranscriptNames(); feature = getSplicedFeatures(name); if(feature != null) { for(int i=0; i splicedSegments = getSplicedFeaturesOfTranscript(transcriptName); if(splicedSegments != null) { for(int j=0; j transcripts = getTranscripts(); for(int i=0;i children = getChildren(transcript); Iterator it = children.iterator(); while(it.hasNext()) { Feature f = it.next(); if(name.equals(GeneUtils.getUniqueName(f))) return GeneUtils.getUniqueName(transcript); } } return null; } /** * Return the protein feature of a transcipt. * @param transcript_name * @return */ public Feature getProteinOfTranscript(final String transcript_name) { if(proteins.containsKey(transcript_name)) return (Feature)proteins.get(transcript_name);; return null; } /** * Return the 3'UTR features of a transcriot as a List. * @param transcript_name * @return */ public List get3UtrOfTranscript(final String transcript_name) { if(three_prime_utr.containsKey(transcript_name)) return (List)three_prime_utr.get(transcript_name); return null; } /** * Return the 5'UTR features of a transcriot as a List. * @param transcript_name * @return */ public List get5UtrOfTranscript(final String transcript_name) { if(five_prime_utr.containsKey(transcript_name)) return (List)five_prime_utr.get(transcript_name); return null; } /** * Utility to determine if this is the first or only UTR, so that * partial qualifiers can be added to the correct UTR feature. * @param utrName * @param isFwd * @return */ public boolean isFirstUtr(final String utrName, final boolean isFwd) { try { Feature this5Utr = getFeatureFromHash(utrName, five_prime_utr); if (this5Utr != null) { String transcript_name = getQualifier(this5Utr, "Parent"); List utrs = get5UtrOfTranscript(transcript_name); if (utrs.size() == 1) return true; for (Feature utr : utrs) { if (isFwd && utr.getFirstBase() < this5Utr.getFirstBase()) return false; else if (!isFwd && utr.getLastBase() > this5Utr.getLastBase()) return false; } return true; } Feature this3Utr = getFeatureFromHash(utrName, three_prime_utr); if (this3Utr != null) { String transcript_name = getQualifier(this3Utr, "Parent"); List utrs = get3UtrOfTranscript(transcript_name); if (utrs.size() == 1) return true; for (Feature utr : utrs) { if (!isFwd && utr.getFirstBase() < this3Utr.getFirstBase()) return false; else if (isFwd && utr.getLastBase() > this3Utr.getLastBase()) return false; } return true; } } catch(InvalidRelationException ire){} return false; } /** * Return the other child features of a transcriot as a List. * @param transcript_name * @return */ public List getOtherFeaturesOfTranscript(final String transcript_name) { if(other_features.containsKey(transcript_name)) return other_features.get(transcript_name); return null; } /** * Get a list of trancripts. * @return */ public List getTranscripts() { return transcripts; } /** * Get a list of trancripts. * @return */ private List getTranscriptNames() { List names = new Vector(); for(int i=0; i enum_pp = proteins.elements(); while(enum_pp.hasMoreElements()) { final Feature pp = enum_pp.nextElement(); if( getQualifier(pp, "ID").equals(name) ) return false; } if( getQualifier(getGene(), "ID").equals(name) ) return false; } catch(InvalidRelationException e) { e.printStackTrace(); } return true; } /** * Test if the name is a transcript in this gene model. * @param feature_id * @return true if a transcript */ public boolean isTranscript(final String feature_id) { try { for(int i=0; i splicedFeatures = new Vector(); List transcripts = getTranscripts(); try { for(int i = 0; i < transcripts.size(); i++) { Feature transcript = (Feature) transcripts.get(i); String transcript_id = getQualifier(transcript, "ID"); List splicedSites = getSplicedFeaturesOfTranscript(transcript_id); if(splicedSites != null) splicedFeatures.addAll(splicedSites); } if(splicedFeatures == null) return false; for(int i=0; i -1) { try { transcript_number = Integer.parseInt(transcript_id.substring(index+1)); } catch(NumberFormatException nfe) { transcript_number = -1; } } if(transcript_number < 1) { for(transcript_number = 0; transcript_number <= transcripts.size(); transcript_number++) { Feature transcript = (Feature) transcripts.get(transcript_number); if(transcript_id.equals(getQualifier(transcript, "ID"))) break; } } if(transcript_number == 0) name = name + ":exon:"; else name = name + "." + transcript_number + ":exon:"; int auto = 1; while( isSplicedFeatures(name + auto) && auto < 50) auto++; return name + auto; } catch(InvalidRelationException e) { e.printStackTrace(); } return null; } /** * Generate new names for peptide features for this gene model * @param transcript_id * @return */ public String autoGeneratePepName(final String transcript_id) { try { int index = transcript_id.lastIndexOf('.'); if(index == -1) index = transcript_id.lastIndexOf(':'); int transcript_number = -1; if(index > -1) { try { transcript_number = Integer.parseInt(transcript_id.substring(index+1)); } catch(NumberFormatException nfe) { transcript_number = -1; } } if(transcript_number < 1) { for(transcript_number = 1; transcript_number <= transcripts.size(); transcript_number++) { Feature transcript = (Feature) transcripts.get(transcript_number - 1); if(transcript_id.equals(getQualifier(transcript, "ID"))) break; } } String name = (String)getGene().getQualifierByName("ID").getValues().get(0); if(isUniqueName(name+ "." + transcript_number + ":pep")) return name+ "." + transcript_number + ":pep"; else return name + "." + transcript_number + "a:pep"; } catch(InvalidRelationException e) { e.printStackTrace(); } return null; } /** * Generate new names for generic region features for this gene model * @param transcript_id * @return */ public String autoGenerateFeatureName(final String transcript_id, final String keyName) { String featureName = ""; try { featureName = (String)getGene().getQualifierByName("ID").getValues().get(0); } catch(InvalidRelationException e){} final Pattern pattern = Pattern.compile("\\d+$"); final Matcher matcher = pattern.matcher(transcript_id); if(matcher.find()) featureName = featureName+"."+matcher.group()+":"+keyName; else featureName = featureName+":"+keyName; if(!isUniqueName(featureName)) { int num = 1; while(!isUniqueName(featureName + ":" + num) && num < 100) num++; featureName = featureName + ":" + num; } return featureName; } /** * Search for the feature with a particular uniquename * @param name uniquename * @return */ public Object getFeatureFromId(final String name) { Object feature = null; // check gene try { final String uniquename = getQualifier(gene, "ID"); if(uniquename.equals(name)) return gene; } catch(InvalidRelationException e) { e.printStackTrace(); } // check transcript StringVector sv = new StringVector(); sv.add(name); feature = containsTranscript(sv); if(feature != null) return feature; // check exons feature = getSplicedFeatures(name); if(feature != null) return feature; feature = getProtein(name); if(feature != null) return feature; try { feature = getFeatureFromHash(name, three_prime_utr); if(feature != null) return feature; feature = getFeatureFromHash(name, five_prime_utr); if(feature != null) return feature; feature = getFeatureFromHash(name, other_features); } catch(InvalidRelationException e) { e.printStackTrace(); } return feature; } /** * Routine to look for a exon with a particular * uniquename * @param name * @return */ private Feature getSplicedFeatures(final String name) { Enumeration> enum_exons = splicedFeatures.elements(); try { while(enum_exons.hasMoreElements()) { List exons = enum_exons.nextElement(); for(int i=0; i enum_proteins = proteins.elements(); try { while(enum_proteins.hasMoreElements()) { Feature protein = enum_proteins.nextElement(); if(getQualifier(protein, "ID").equals(id)) return protein; } } catch(InvalidRelationException e) { e.printStackTrace(); } return null; } /** * Search for a feature uniquename * @param id * @param UTR * @return * @throws InvalidRelationException */ private Feature getFeatureFromHash (final String id, final Hashtable> UTR) throws InvalidRelationException { Enumeration> enum_utr = UTR.elements(); while(enum_utr.hasMoreElements()) { List utrs = enum_utr.nextElement(); for(int i=0; i> getSplicedFeatures() { return splicedFeatures; } /** * Get the nucleotide location for a featureloc in amino acid * coordinates. * @param proteinFeature * @param featureLocToProtein * @return * @throws LocationParseException */ public Location getNucLocation(final Feature proteinFeature, final FeatureLoc featureLocToProtein) throws LocationParseException { String transcriptName = getTranscriptFromName( GeneUtils.getUniqueName(proteinFeature)); List spliced = getSplicedFeaturesOfTranscript(transcriptName); if(spliced == null) return null; RangeVector ranges = new RangeVector(); for(int i=0; i len) fmax = len; if(ranges.size()>1) { Collections.sort(ranges, new RangeComparator()); for(int i=0;i range1.getEnd()) fmin += range2.getStart()-range1.getEnd(); if(fmax > range1.getEnd()) fmax += range2.getStart()-range1.getEnd(); } } Location location; if(proteinFeature.getLocation().isComplement()) location = new Location("complement("+fmin+".."+fmax+")"); else location = new Location(fmin+".."+fmax); return location; } class RangeComparator implements Comparator { public int compare(Range o1, Range o2) { int start1 = o1.getStart(); int start2 = o2.getStart(); return start1-start2; } } }