1 /* 2 * Jalview - A Sequence Alignment Editor and Viewer (2.11.1.4) 3 * Copyright (C) 2021 The Jalview Authors 4 * 5 * This file is part of Jalview. 6 * 7 * Jalview is free software: you can redistribute it and/or 8 * modify it under the terms of the GNU General Public License 9 * as published by the Free Software Foundation, either version 3 10 * of the License, or (at your option) any later version. 11 * 12 * Jalview is distributed in the hope that it will be useful, but 13 * WITHOUT ANY WARRANTY; without even the implied warranty 14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR 15 * PURPOSE. See the GNU General Public License for more details. 16 * 17 * You should have received a copy of the GNU General Public License 18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>. 19 * The Jalview Authors are detailed in the 'AUTHORS' file. 20 */ 21 package jalview.io.gff; 22 23 import jalview.datamodel.AlignedCodonFrame; 24 import jalview.datamodel.AlignmentI; 25 import jalview.datamodel.MappingType; 26 import jalview.datamodel.SequenceFeature; 27 import jalview.datamodel.SequenceI; 28 import jalview.util.MapList; 29 30 import java.io.IOException; 31 import java.util.List; 32 import java.util.Map; 33 34 /** 35 * A handler to parse GFF in the format generated by the exonerate tool 36 */ 37 public class ExonerateHelper extends Gff2Helper 38 { 39 private static final String SIMILARITY = "similarity"; 40 41 private static final String GENOME2GENOME = "genome2genome"; 42 43 private static final String CDNA2GENOME = "cdna2genome"; 44 45 private static final String CODING2GENOME = "coding2genome"; 46 47 private static final String CODING2CODING = "coding2coding"; 48 49 private static final String PROTEIN2GENOME = "protein2genome"; 50 51 private static final String PROTEIN2DNA = "protein2dna"; 52 53 private static final String ALIGN = "Align"; 54 55 private static final String QUERY = "Query"; 56 57 private static final String TARGET = "Target"; 58 59 /** 60 * Process one GFF feature line (as modelled by SequenceFeature) 61 * 62 * @param seq 63 * the sequence with which this feature is associated 64 * @param gffColumns 65 * the sequence feature with ATTRIBUTES property containing any 66 * additional attributes 67 * @param align 68 * the alignment we are adding GFF to 69 * @param newseqs 70 * any new sequences referenced by the GFF 71 * @param relaxedIdMatching 72 * if true, match word tokens in sequence names 73 * @return true if the sequence feature should be added to the sequence, else 74 * false (i.e. it has been processed in another way e.g. to generate a 75 * mapping) 76 */ 77 @Override processGff(SequenceI seq, String[] gffColumns, AlignmentI align, List<SequenceI> newseqs, boolean relaxedIdMatching)78 public SequenceFeature processGff(SequenceI seq, String[] gffColumns, 79 AlignmentI align, List<SequenceI> newseqs, 80 boolean relaxedIdMatching) 81 { 82 String attr = gffColumns[ATTRIBUTES_COL]; 83 Map<String, List<String>> set = parseNameValuePairs(attr); 84 85 try 86 { 87 processGffSimilarity(set, seq, gffColumns, align, newseqs, 88 relaxedIdMatching); 89 } catch (IOException ivfe) 90 { 91 System.err.println(ivfe); 92 } 93 94 /* 95 * return null to indicate we don't want to add a sequence feature for 96 * similarity (only process it to create mappings) 97 */ 98 return null; 99 } 100 101 /** 102 * Processes the 'Query' (or 'Target') and 'Align' properties associated with 103 * an exonerate GFF similarity feature; these properties define the mapping of 104 * the annotated range to a related sequence. 105 * 106 * @param set 107 * parsed GFF column 9 key/value(s) 108 * @param seq 109 * the sequence the GFF feature is on 110 * @param gff 111 * the GFF column data 112 * @param align 113 * the alignment the sequence belongs to, where any new mappings 114 * should be added 115 * @param newseqs 116 * a list of new 'virtual sequences' generated while parsing GFF 117 * @param relaxedIdMatching 118 * if true allow fuzzy search for a matching target sequence 119 * @throws IOException 120 */ processGffSimilarity(Map<String, List<String>> set, SequenceI seq, String[] gff, AlignmentI align, List<SequenceI> newseqs, boolean relaxedIdMatching)121 protected void processGffSimilarity(Map<String, List<String>> set, 122 SequenceI seq, String[] gff, AlignmentI align, 123 List<SequenceI> newseqs, boolean relaxedIdMatching) 124 throws IOException 125 { 126 /* 127 * exonerate may be run with 128 * --showquerygff - outputs 'features on the query' e.g. (protein2genome) 129 * Target <dnaseqid> ; Align proteinStartPos dnaStartPos proteinCount 130 * --showtargetgff - outputs 'features on the target' e.g. (protein2genome) 131 * Query <proteinseqid> ; Align dnaStartPos proteinStartPos nucleotideCount 132 * where the Align spec may repeat 133 */ 134 // TODO handle coding2coding and similar as well 135 boolean featureIsOnTarget = true; 136 List<String> mapTo = set.get(QUERY); 137 if (mapTo == null) 138 { 139 mapTo = set.get(TARGET); 140 featureIsOnTarget = false; 141 } 142 MappingType type = getMappingType(gff[SOURCE_COL]); 143 144 if (type == null) 145 { 146 throw new IOException("Sorry, I don't handle " + gff[SOURCE_COL]); 147 } 148 149 if (mapTo == null || mapTo.size() != 1) 150 { 151 throw new IOException( 152 "Expecting exactly one sequence in Query or Target field (got " 153 + mapTo + ")"); 154 } 155 156 /* 157 * locate the mapped sequence in the alignment or 'new' (GFF file) sequences; 158 */ 159 SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs, 160 relaxedIdMatching); 161 162 /* 163 * If mapping is from protein to dna, we store it as dna to protein instead 164 */ 165 SequenceI mapFromSequence = seq; 166 SequenceI mapToSequence = mappedSequence; 167 if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget) 168 || (type == MappingType.PeptideToNucleotide 169 && !featureIsOnTarget)) 170 { 171 mapFromSequence = mappedSequence; 172 mapToSequence = seq; 173 } 174 175 /* 176 * Process the Align maps and create mappings. 177 * These may be cdna-genome, cdna-protein, genome-protein. 178 * The mapped sequences may or may not be in the alignment 179 * (they may be included later in the GFF file). 180 */ 181 182 /* 183 * get any existing mapping for these sequences (or start one), 184 * and add this mapped range 185 */ 186 AlignedCodonFrame acf = getMapping(align, mapFromSequence, 187 mapToSequence); 188 189 /* 190 * exonerate GFF has the strand of the target in column 7 191 * (differs from GFF3 which has it in the Target descriptor) 192 */ 193 String strand = gff[STRAND_COL]; 194 boolean forwardStrand = true; 195 if ("-".equals(strand)) 196 { 197 forwardStrand = false; 198 } 199 else if (!"+".equals(strand)) 200 { 201 System.err.println("Strand must be specified for alignment"); 202 return; 203 } 204 205 List<String> alignedRegions = set.get(ALIGN); 206 for (String region : alignedRegions) 207 { 208 MapList mapping = buildMapping(region, type, forwardStrand, 209 featureIsOnTarget, gff); 210 211 if (mapping == null) 212 { 213 continue; 214 } 215 216 acf.addMap(mapFromSequence, mapToSequence, mapping); 217 } 218 align.addCodonFrame(acf); 219 } 220 221 /** 222 * Construct the mapping 223 * 224 * @param region 225 * @param type 226 * @param forwardStrand 227 * @param featureIsOnTarget 228 * @param gff 229 * @return 230 */ buildMapping(String region, MappingType type, boolean forwardStrand, boolean featureIsOnTarget, String[] gff)231 protected MapList buildMapping(String region, MappingType type, 232 boolean forwardStrand, boolean featureIsOnTarget, String[] gff) 233 { 234 /* 235 * process one "fromStart toStart fromCount" descriptor 236 */ 237 String[] tokens = region.split(" "); 238 if (tokens.length != 3) 239 { 240 System.err.println("Malformed Align descriptor: " + region); 241 return null; 242 } 243 244 /* 245 * get start/end of from/to mappings 246 * if feature is on the target sequence we have to invert the sense 247 */ 248 int alignFromStart; 249 int alignToStart; 250 int alignCount; 251 try 252 { 253 alignFromStart = Integer.parseInt(tokens[0]); 254 alignToStart = Integer.parseInt(tokens[1]); 255 alignCount = Integer.parseInt(tokens[2]); 256 } catch (NumberFormatException nfe) 257 { 258 System.err.println(nfe.toString()); 259 return null; 260 } 261 262 int fromStart; 263 int fromEnd; 264 int toStart; 265 int toEnd; 266 267 if (featureIsOnTarget) 268 { 269 fromStart = alignToStart; 270 toStart = alignFromStart; 271 toEnd = forwardStrand ? toStart + alignCount - 1 272 : toStart - (alignCount - 1); 273 int toLength = Math.abs(toEnd - toStart) + 1; 274 int fromLength = toLength * type.getFromRatio() / type.getToRatio(); 275 fromEnd = fromStart + fromLength - 1; 276 } 277 else 278 { 279 // we use the 'Align' values here not the feature start/end 280 // not clear why they may differ but it seems they can 281 fromStart = alignFromStart; 282 fromEnd = alignFromStart + alignCount - 1; 283 int fromLength = fromEnd - fromStart + 1; 284 int toLength = fromLength * type.getToRatio() / type.getFromRatio(); 285 toStart = alignToStart; 286 if (forwardStrand) 287 { 288 toEnd = toStart + toLength - 1; 289 } 290 else 291 { 292 toEnd = toStart - (toLength - 1); 293 } 294 } 295 296 MapList codonmapping = constructMappingFromAlign(fromStart, fromEnd, 297 toStart, toEnd, type); 298 return codonmapping; 299 } 300 301 /** 302 * Returns a MappingType depending on the exonerate 'model' value. 303 * 304 * @param model 305 * @return 306 */ getMappingType(String model)307 protected static MappingType getMappingType(String model) 308 { 309 MappingType result = null; 310 311 if (model.contains(PROTEIN2DNA) || model.contains(PROTEIN2GENOME)) 312 { 313 result = MappingType.PeptideToNucleotide; 314 } 315 else if (model.contains(CODING2CODING) || model.contains(CODING2GENOME) 316 || model.contains(CDNA2GENOME) || model.contains(GENOME2GENOME)) 317 { 318 result = MappingType.NucleotideToNucleotide; 319 } 320 return result; 321 } 322 323 /** 324 * Tests whether the GFF data looks like it was generated by exonerate, and is 325 * a format we are willing to handle 326 * 327 * @param columns 328 * @return 329 */ recognises(String[] columns)330 public static boolean recognises(String[] columns) 331 { 332 if (!SIMILARITY.equalsIgnoreCase(columns[TYPE_COL])) 333 { 334 return false; 335 } 336 337 /* 338 * inspect alignment model 339 */ 340 String model = columns[SOURCE_COL]; 341 // e.g. exonerate:protein2genome:local 342 if (model != null) 343 { 344 String mdl = model.toLowerCase(); 345 if (mdl.contains(PROTEIN2DNA) || mdl.contains(PROTEIN2GENOME) 346 || mdl.contains(CODING2CODING) || mdl.contains(CODING2GENOME) 347 || mdl.contains(CDNA2GENOME) || mdl.contains(GENOME2GENOME)) 348 { 349 return true; 350 } 351 } 352 System.err.println("Sorry, I don't handle exonerate model " + model); 353 return false; 354 } 355 356 /** 357 * An override to set feature group to "exonerate" instead of the default GFF 358 * source value (column 2) 359 */ 360 @Override buildSequenceFeature(String[] gff, Map<String, List<String>> set)361 protected SequenceFeature buildSequenceFeature(String[] gff, 362 Map<String, List<String>> set) 363 { 364 SequenceFeature sf = super.buildSequenceFeature(gff, TYPE_COL, 365 "exonerate", set); 366 367 return sf; 368 } 369 370 } 371