1 /* 2 * Jalview - A Sequence Alignment Editor and Viewer (2.11.1.4) 3 * Copyright (C) 2021 The Jalview Authors 4 * 5 * This file is part of Jalview. 6 * 7 * Jalview is free software: you can redistribute it and/or 8 * modify it under the terms of the GNU General Public License 9 * as published by the Free Software Foundation, either version 3 10 * of the License, or (at your option) any later version. 11 * 12 * Jalview is distributed in the hope that it will be useful, but 13 * WITHOUT ANY WARRANTY; without even the implied warranty 14 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR 15 * PURPOSE. See the GNU General Public License for more details. 16 * 17 * You should have received a copy of the GNU General Public License 18 * along with Jalview. If not, see <http://www.gnu.org/licenses/>. 19 * The Jalview Authors are detailed in the 'AUTHORS' file. 20 */ 21 package jalview.io.gff; 22 23 import jalview.analysis.SequenceIdMatcher; 24 import jalview.datamodel.AlignedCodonFrame; 25 import jalview.datamodel.AlignmentI; 26 import jalview.datamodel.MappingType; 27 import jalview.datamodel.SequenceDummy; 28 import jalview.datamodel.SequenceFeature; 29 import jalview.datamodel.SequenceI; 30 import jalview.util.MapList; 31 import jalview.util.StringUtils; 32 33 import java.util.ArrayList; 34 import java.util.Arrays; 35 import java.util.HashMap; 36 import java.util.List; 37 import java.util.Map; 38 import java.util.Map.Entry; 39 40 /** 41 * Base class with common functionality for flavours of GFF handler (GFF2 or 42 * GFF3) 43 */ 44 public abstract class GffHelperBase implements GffHelperI 45 { 46 private static final String INVALID_GFF_ATTRIBUTE_FORMAT = "Invalid GFF attribute format: "; 47 48 protected static final String COMMA = ","; 49 50 protected static final String EQUALS = "="; 51 52 protected static final String NOTE = "Note"; 53 54 /* 55 * GFF columns 1-9 (zero-indexed): 56 */ 57 protected static final int SEQID_COL = 0; 58 59 protected static final int SOURCE_COL = 1; 60 61 protected static final int TYPE_COL = 2; 62 63 protected static final int START_COL = 3; 64 65 protected static final int END_COL = 4; 66 67 protected static final int SCORE_COL = 5; 68 69 protected static final int STRAND_COL = 6; 70 71 protected static final int PHASE_COL = 7; 72 73 protected static final int ATTRIBUTES_COL = 8; 74 75 private AlignmentI lastmatchedAl = null; 76 77 private SequenceIdMatcher matcher = null; 78 79 /** 80 * Constructs and returns a mapping, or null if data appear invalid 81 * 82 * @param fromStart 83 * @param fromEnd 84 * @param toStart 85 * @param toEnd 86 * @param mappingType 87 * type of mapping (e.g. protein to nucleotide) 88 * @return 89 */ constructMappingFromAlign(int fromStart, int fromEnd, int toStart, int toEnd, MappingType mappingType)90 protected MapList constructMappingFromAlign(int fromStart, int fromEnd, 91 int toStart, int toEnd, MappingType mappingType) 92 { 93 int[] from = new int[] { fromStart, fromEnd }; 94 int[] to = new int[] { toStart, toEnd }; 95 96 /* 97 * Jalview always models from dna to protein, so switch values if the 98 * GFF mapping is from protein to dna 99 */ 100 if (mappingType == MappingType.PeptideToNucleotide) 101 { 102 int[] temp = from; 103 from = to; 104 to = temp; 105 mappingType = mappingType.getInverse(); 106 } 107 108 int fromRatio = mappingType.getFromRatio(); 109 int toRatio = mappingType.getToRatio(); 110 111 /* 112 * sanity check that mapped residue counts match 113 * TODO understand why PASA generates such cases... 114 */ 115 if (!trimMapping(from, to, fromRatio, toRatio)) 116 { 117 System.err.println("Ignoring mapping from " + Arrays.toString(from) 118 + " to " + Arrays.toString(to) + " as counts don't match!"); 119 return null; 120 } 121 122 /* 123 * If a codon has an intron gap, there will be contiguous 'toRanges'; 124 * this is handled for us by the MapList constructor. 125 * (It is not clear that exonerate ever generates this case) 126 */ 127 128 return new MapList(from, to, fromRatio, toRatio); 129 } 130 131 /** 132 * Checks that the 'from' and 'to' ranges have equivalent lengths. If not, 133 * tries to trim the end of the longer so they do. Returns true if the 134 * mappings could be made equivalent, else false. Note the range array values 135 * may be modified by this method. 136 * 137 * @param from 138 * @param to 139 * @param fromRatio 140 * @param toRatio 141 * @return 142 */ trimMapping(int[] from, int[] to, int fromRatio, int toRatio)143 protected static boolean trimMapping(int[] from, int[] to, int fromRatio, 144 int toRatio) 145 { 146 int fromLength = Math.abs(from[1] - from[0]) + 1; 147 int toLength = Math.abs(to[1] - to[0]) + 1; 148 int fromOverlap = fromLength * toRatio - toLength * fromRatio; 149 if (fromOverlap == 0) 150 { 151 return true; 152 } 153 if (fromOverlap > 0 && fromOverlap % toRatio == 0) 154 { 155 /* 156 * restrict from range to make them match up 157 * it's kind of arbitrary which end we truncate - here it is the end 158 */ 159 System.err.print( 160 "Truncating mapping from " + Arrays.toString(from) + " to "); 161 if (from[1] > from[0]) 162 { 163 from[1] -= fromOverlap / toRatio; 164 } 165 else 166 { 167 from[1] += fromOverlap / toRatio; 168 } 169 System.err.println(Arrays.toString(from)); 170 return true; 171 } 172 else if (fromOverlap < 0 && fromOverlap % fromRatio == 0) 173 { 174 fromOverlap = -fromOverlap; // > 0 175 /* 176 * restrict to range to make them match up 177 */ 178 System.err.print( 179 "Truncating mapping to " + Arrays.toString(to) + " to "); 180 if (to[1] > to[0]) 181 { 182 to[1] -= fromOverlap / fromRatio; 183 } 184 else 185 { 186 to[1] += fromOverlap / fromRatio; 187 } 188 System.err.println(Arrays.toString(to)); 189 return true; 190 } 191 192 /* 193 * Couldn't truncate to an exact match.. 194 */ 195 return false; 196 } 197 198 /** 199 * Returns a sequence matching the given id, as follows 200 * <ul> 201 * <li>strict matching is on exact sequence name</li> 202 * <li>relaxed matching allows matching on a token within the sequence name, 203 * or a dbxref</li> 204 * <li>first tries to find a match in the alignment sequences</li> 205 * <li>else tries to find a match in the new sequences already generated while 206 * parsing the features file</li> 207 * <li>else creates a new placeholder sequence, adds it to the new sequences 208 * list, and returns it</li> 209 * </ul> 210 * 211 * @param seqId 212 * @param align 213 * @param newseqs 214 * @param relaxedIdMatching 215 * 216 * @return 217 */ findSequence(String seqId, AlignmentI align, List<SequenceI> newseqs, boolean relaxedIdMatching)218 protected SequenceI findSequence(String seqId, AlignmentI align, 219 List<SequenceI> newseqs, boolean relaxedIdMatching) 220 { 221 if (seqId == null) 222 { 223 return null; 224 } 225 SequenceI match = null; 226 if (relaxedIdMatching) 227 { 228 if (lastmatchedAl != align) 229 { 230 lastmatchedAl = align; 231 matcher = new SequenceIdMatcher(align.getSequencesArray()); 232 if (newseqs != null) 233 { 234 matcher.addAll(newseqs); 235 } 236 } 237 match = matcher.findIdMatch(seqId); 238 } 239 else 240 { 241 match = align.findName(seqId, true); 242 if (match == null && newseqs != null) 243 { 244 for (SequenceI m : newseqs) 245 { 246 if (seqId.equals(m.getName())) 247 { 248 return m; 249 } 250 } 251 } 252 253 } 254 if (match == null && newseqs != null) 255 { 256 match = new SequenceDummy(seqId); 257 if (relaxedIdMatching) 258 { 259 matcher.addAll(Arrays.asList(new SequenceI[] { match })); 260 } 261 // add dummy sequence to the newseqs list 262 newseqs.add(match); 263 } 264 return match; 265 } 266 267 /** 268 * Parses the input line to a map of name / value(s) pairs. For example the 269 * line 270 * 271 * <pre> 272 * Notes=Fe-S;Method=manual curation, prediction; source = Pfam; Notes = Metal 273 * </pre> 274 * 275 * if parsed with delimiter=";" and separators {' ', '='} <br> 276 * would return a map with { Notes={Fe=S, Metal}, Method={manual curation, 277 * prediction}, source={Pfam}} <br> 278 * 279 * This method supports parsing of either GFF2 format (which uses space ' ' as 280 * the name/value delimiter, and allows multiple occurrences of the same 281 * name), or GFF3 format (which uses '=' as the name/value delimiter, and 282 * strictly does not allow repeat occurrences of the same name - but does 283 * allow a comma-separated list of values). 284 * <p> 285 * Returns a (possibly empty) map of lists of values by attribute name. 286 * 287 * @param text 288 * @param namesDelimiter 289 * the major delimiter between name-value pairs 290 * @param nameValueSeparator 291 * separator used between name and value 292 * @param valuesDelimiter 293 * delimits a list of more than one value 294 * @return 295 */ parseNameValuePairs(String text, String namesDelimiter, char nameValueSeparator, String valuesDelimiter)296 public static Map<String, List<String>> parseNameValuePairs(String text, 297 String namesDelimiter, char nameValueSeparator, 298 String valuesDelimiter) 299 { 300 Map<String, List<String>> map = new HashMap<>(); 301 if (text == null || text.trim().length() == 0) 302 { 303 return map; 304 } 305 306 /* 307 * split by major delimiter (; for GFF3) 308 */ 309 for (String nameValuePair : text.trim().split(namesDelimiter)) 310 { 311 nameValuePair = nameValuePair.trim(); 312 if (nameValuePair.length() == 0) 313 { 314 continue; 315 } 316 317 /* 318 * find name/value separator (= for GFF3) 319 */ 320 int sepPos = nameValuePair.indexOf(nameValueSeparator); 321 if (sepPos == -1) 322 { 323 // no name=value found 324 continue; 325 } 326 327 String name = nameValuePair.substring(0, sepPos).trim(); 328 String values = nameValuePair.substring(sepPos + 1).trim(); 329 if (values.isEmpty()) 330 { 331 continue; 332 } 333 334 List<String> vals = map.get(name); 335 if (vals == null) 336 { 337 vals = new ArrayList<>(); 338 map.put(name, vals); 339 } 340 341 /* 342 * if 'values' contains more name/value separators, parse as a map 343 * (nested sub-attribute values) 344 */ 345 if (values.indexOf(nameValueSeparator) != -1) 346 { 347 vals.add(values); 348 } 349 else 350 { 351 for (String val : values.split(valuesDelimiter)) 352 { 353 vals.add(val); 354 } 355 } 356 } 357 358 return map; 359 } 360 361 /** 362 * Constructs a SequenceFeature from the GFF column data. Subclasses may wish 363 * to call this method then adjust the SequenceFeature depending on the 364 * particular usage of different tools that generate GFF. 365 * 366 * @param gff 367 * @param attributes 368 * @return 369 */ buildSequenceFeature(String[] gff, Map<String, List<String>> attributes)370 protected SequenceFeature buildSequenceFeature(String[] gff, 371 Map<String, List<String>> attributes) 372 { 373 return buildSequenceFeature(gff, TYPE_COL, gff[SOURCE_COL], attributes); 374 } 375 376 /** 377 * @param gff 378 * @param typeColumn 379 * @param group 380 * @param attributes 381 * @return 382 */ buildSequenceFeature(String[] gff, int typeColumn, String group, Map<String, List<String>> attributes)383 protected SequenceFeature buildSequenceFeature(String[] gff, 384 int typeColumn, String group, Map<String, List<String>> attributes) 385 { 386 try 387 { 388 int start = Integer.parseInt(gff[START_COL]); 389 int end = Integer.parseInt(gff[END_COL]); 390 391 /* 392 * default 'score' is 0 rather than Float.NaN - see JAL-2554 393 */ 394 float score = 0f; 395 try 396 { 397 score = Float.parseFloat(gff[SCORE_COL]); 398 } catch (NumberFormatException nfe) 399 { 400 // e.g. '.' - leave as zero 401 } 402 403 SequenceFeature sf = new SequenceFeature(gff[typeColumn], 404 gff[SOURCE_COL], start, end, score, group); 405 406 sf.setStrand(gff[STRAND_COL]); 407 408 sf.setPhase(gff[PHASE_COL]); 409 410 if (attributes != null) 411 { 412 /* 413 * Add attributes in column 9 to the sequence feature's 414 * 'otherData' table; use Note as a best proxy for description; 415 * decode any encoded comma, equals, semi-colon as per GFF3 spec 416 */ 417 for (Entry<String, List<String>> attr : attributes.entrySet()) 418 { 419 String key = attr.getKey(); 420 List<String> values = attr.getValue(); 421 if (values.size() == 1 && values.get(0).contains(EQUALS)) 422 { 423 /* 424 * 'value' is actually nested subattributes as x=a,y=b,z=c 425 */ 426 Map<String, String> valueMap = parseAttributeMap(values.get(0)); 427 sf.setValue(key, valueMap); 428 } 429 else 430 { 431 String csvValues = StringUtils.listToDelimitedString(values, 432 COMMA); 433 csvValues = StringUtils.urlDecode(csvValues, GFF_ENCODABLE); 434 sf.setValue(key, csvValues); 435 if (NOTE.equals(key)) 436 { 437 sf.setDescription(csvValues); 438 } 439 } 440 } 441 } 442 443 return sf; 444 } catch (NumberFormatException nfe) 445 { 446 System.err.println("Invalid number in gff: " + nfe.getMessage()); 447 return null; 448 } 449 } 450 451 /** 452 * Parses a (GFF3 format) list of comma-separated key=value pairs into a Map 453 * of {@code key, 454 * value} <br> 455 * An input string like {@code a=b,c,d=e,f=g,h} is parsed to 456 * 457 * <pre> 458 * a = "b,c" 459 * d = "e" 460 * f = "g,h" 461 * </pre> 462 * 463 * @param s 464 * 465 * @return 466 */ parseAttributeMap(String s)467 protected static Map<String, String> parseAttributeMap(String s) 468 { 469 Map<String, String> map = new HashMap<>(); 470 String[] fields = s.split(EQUALS); 471 472 /* 473 * format validation 474 */ 475 boolean valid = true; 476 if (fields.length < 2) 477 { 478 /* 479 * need at least A=B here 480 */ 481 valid = false; 482 } 483 else if (fields[0].isEmpty() || fields[0].contains(COMMA)) 484 { 485 /* 486 * A,B=C is not a valid start, nor is =C 487 */ 488 valid = false; 489 } 490 else 491 { 492 for (int i = 1; i < fields.length - 1; i++) 493 { 494 if (fields[i].isEmpty() || !fields[i].contains(COMMA)) 495 { 496 /* 497 * intermediate tokens must include value,name 498 */ 499 valid = false; 500 } 501 } 502 } 503 504 if (!valid) 505 { 506 System.err.println(INVALID_GFF_ATTRIBUTE_FORMAT + s); 507 return map; 508 } 509 510 int i = 0; 511 while (i < fields.length - 1) 512 { 513 boolean lastPair = i == fields.length - 2; 514 String before = fields[i]; 515 String after = fields[i + 1]; 516 517 /* 518 * if 'key' looks like a,b,c then the last token is the 519 * key 520 */ 521 String theKey = before.contains(COMMA) 522 ? before.substring(before.lastIndexOf(COMMA) + 1) 523 : before; 524 525 theKey = theKey.trim(); 526 if (theKey.isEmpty()) 527 { 528 System.err.println(INVALID_GFF_ATTRIBUTE_FORMAT + s); 529 map.clear(); 530 return map; 531 } 532 533 /* 534 * if 'value' looks like a,b,c then all but the last token is the value, 535 * unless this is the last field (no more = to follow), in which case 536 * all of it makes up the value 537 */ 538 String theValue = after.contains(COMMA) && !lastPair 539 ? after.substring(0, after.lastIndexOf(COMMA)) 540 : after; 541 map.put(StringUtils.urlDecode(theKey, GFF_ENCODABLE), 542 StringUtils.urlDecode(theValue, GFF_ENCODABLE)); 543 i += 1; 544 } 545 546 return map; 547 } 548 549 /** 550 * Returns any existing mapping held on the alignment between the given 551 * dataset sequences, or a new one if none found. This is a convenience method 552 * to facilitate processing multiple GFF lines that make up a single 'spliced' 553 * mapping, by extending the first mapping as the others are read. 554 * 555 * @param align 556 * @param fromSeq 557 * @param toSeq 558 * @return 559 */ getMapping(AlignmentI align, SequenceI fromSeq, SequenceI toSeq)560 protected AlignedCodonFrame getMapping(AlignmentI align, 561 SequenceI fromSeq, SequenceI toSeq) 562 { 563 AlignedCodonFrame acf = align.getMapping(fromSeq, toSeq); 564 if (acf == null) 565 { 566 acf = new AlignedCodonFrame(); 567 } 568 return acf; 569 } 570 571 } 572