io/gff/ExonerateHelper.java

/*
 * Jalview - A Sequence Alignment Editor and Viewer (2.11.1.4)
 * Copyright (C) 2021 The Jalview Authors
 *
 * This file is part of Jalview.
 *
 * Jalview is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *
 * Jalview is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE.  See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Jalview.  If not, see <http://www.gnu.org/licenses/>.
 * The Jalview Authors are detailed in the 'AUTHORS' file.
 */
package jalview.io.gff;

import jalview.datamodel.AlignedCodonFrame;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.MappingType;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;
import jalview.util.MapList;

import java.io.IOException;
import java.util.List;
import java.util.Locale;
import java.util.Map;

/**
 * A handler to parse GFF in the format generated by the exonerate tool.
 * <p>
 * Only 'similarity' features are handled. They are not added to the sequence
 * as features; instead their 'Query' (or 'Target') and 'Align' attributes are
 * converted into sequence mappings which are added to the alignment.
 */
public class ExonerateHelper extends Gff2Helper
{
  private static final String SIMILARITY = "similarity";

  /*
   * exonerate alignment model names, as found (lower-cased) in GFF column 2
   */
  private static final String GENOME2GENOME = "genome2genome";

  private static final String CDNA2GENOME = "cdna2genome";

  private static final String CODING2GENOME = "coding2genome";

  private static final String CODING2CODING = "coding2coding";

  private static final String PROTEIN2GENOME = "protein2genome";

  private static final String PROTEIN2DNA = "protein2dna";

  /*
   * attribute keys read from GFF column 9
   */
  private static final String ALIGN = "Align";

  private static final String QUERY = "Query";

  private static final String TARGET = "Target";

  /**
   * Processes one exonerate GFF line. The line's attributes are parsed and
   * used to create a mapping between sequences; no sequence feature is ever
   * returned. Any IOException raised while processing is reported on stderr
   * and otherwise ignored.
   *
   * @param seq
   *          the sequence with which this feature is associated
   * @param gffColumns
   *          the columns of the GFF line
   * @param align
   *          the alignment we are adding GFF to
   * @param newseqs
   *          any new sequences referenced by the GFF
   * @param relaxedIdMatching
   *          if true, match word tokens in sequence names
   * @return always null, to indicate that no sequence feature should be added
   *         for the line (it is only processed to generate a mapping)
   */
  @Override
  public SequenceFeature processGff(SequenceI seq, String[] gffColumns,
          AlignmentI align, List<SequenceI> newseqs,
          boolean relaxedIdMatching)
  {
    String attr = gffColumns[ATTRIBUTES_COL];
    Map<String, List<String>> set = parseNameValuePairs(attr);

    try
    {
      processGffSimilarity(set, seq, gffColumns, align, newseqs,
              relaxedIdMatching);
    } catch (IOException ivfe)
    {
      System.err.println(ivfe);
    }

    /*
     * return null to indicate we don't want to add a sequence feature for
     * similarity (only process it to create mappings)
     */
    return null;
  }

  /**
   * Processes the 'Query' (or 'Target') and 'Align' properties associated with
   * an exonerate GFF similarity feature; these properties define the mapping of
   * the annotated range to a related sequence.
   * <p>
   * If the strand column is neither '+' nor '-', or there is no 'Align'
   * attribute, a message is written to stderr and nothing is added to the
   * alignment. Individual 'Align' values that cannot be parsed are skipped.
   *
   * @param set
   *          parsed GFF column 9 key/value(s)
   * @param seq
   *          the sequence the GFF feature is on
   * @param gff
   *          the GFF column data
   * @param align
   *          the alignment the sequence belongs to, where any new mappings
   *          should be added
   * @param newseqs
   *          a list of new 'virtual sequences' generated while parsing GFF
   * @param relaxedIdMatching
   *          if true allow fuzzy search for a matching target sequence
   * @throws IOException
   *           if the exonerate model in the source column is not handled, or
   *           if there is not exactly one Query (or Target) sequence id
   */
  protected void processGffSimilarity(Map<String, List<String>> set,
          SequenceI seq, String[] gff, AlignmentI align,
          List<SequenceI> newseqs, boolean relaxedIdMatching)
          throws IOException
  {
    /*
     * exonerate may be run with
     * --showquerygff - outputs 'features on the query' e.g. (protein2genome)
     *     Target <dnaseqid> ; Align proteinStartPos dnaStartPos proteinCount
     * --showtargetgff - outputs 'features on the target' e.g. (protein2genome)
     *     Query <proteinseqid> ; Align dnaStartPos proteinStartPos nucleotideCount
     * where the Align spec may repeat
     */
    // TODO handle coding2coding and similar as well
    boolean featureIsOnTarget = true;
    List<String> mapTo = set.get(QUERY);
    if (mapTo == null)
    {
      mapTo = set.get(TARGET);
      featureIsOnTarget = false;
    }
    MappingType type = getMappingType(gff[SOURCE_COL]);

    if (type == null)
    {
      throw new IOException("Sorry, I don't handle " + gff[SOURCE_COL]);
    }

    if (mapTo == null || mapTo.size() != 1)
    {
      throw new IOException(
              "Expecting exactly one sequence in Query or Target field (got "
                      + mapTo + ")");
    }

    /*
     * locate the mapped sequence in the alignment or 'new' (GFF file) sequences;
     */
    SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs,
            relaxedIdMatching);

    /*
     * If mapping is from protein to dna, we store it as dna to protein instead
     */
    SequenceI mapFromSequence = seq;
    SequenceI mapToSequence = mappedSequence;
    if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget)
            || (type == MappingType.PeptideToNucleotide
                    && !featureIsOnTarget))
    {
      mapFromSequence = mappedSequence;
      mapToSequence = seq;
    }

    /*
     * Process the Align maps and create mappings.
     * These may be cdna-genome, cdna-protein, genome-protein.
     * The mapped sequences may or may not be in the alignment
     * (they may be included later in the GFF file).
     */

    /*
     * get any existing mapping for these sequences (or start one),
     * and add this mapped range
     */
    AlignedCodonFrame acf = getMapping(align, mapFromSequence,
            mapToSequence);

    /*
     * exonerate GFF has the strand of the target in column 7
     * (differs from GFF3 which has it in the Target descriptor)
     */
    String strand = gff[STRAND_COL];
    boolean forwardStrand = true;
    if ("-".equals(strand))
    {
      forwardStrand = false;
    }
    else if (!"+".equals(strand))
    {
      System.err.println("Strand must be specified for alignment");
      return;
    }

    /*
     * a similarity feature without any Align attribute has nothing to map;
     * report it rather than failing with a NullPointerException
     */
    List<String> alignedRegions = set.get(ALIGN);
    if (alignedRegions == null)
    {
      System.err.println("No Align descriptor found for similarity feature");
      return;
    }

    for (String region : alignedRegions)
    {
      MapList mapping = buildMapping(region, type, forwardStrand,
              featureIsOnTarget, gff);

      if (mapping == null)
      {
        // malformed descriptor - already reported by buildMapping
        continue;
      }

      acf.addMap(mapFromSequence, mapToSequence, mapping);
    }
    align.addCodonFrame(acf);
  }

  /**
   * Constructs the mapping described by a single 'Align' value, which should
   * consist of three whitespace-separated integers "fromStart toStart
   * fromCount". Returns null, after writing a message to stderr, if the value
   * does not have exactly three tokens, if any token is not an integer, or if
   * the count is not positive.
   *
   * @param region
   *          the Align value to parse
   * @param type
   *          the type of mapping, which supplies the from/to length ratio
   * @param forwardStrand
   *          if false, the 'to' range is built in descending order
   * @param featureIsOnTarget
   *          if true, the first two Align values are swapped, and the count is
   *          taken as the length of the 'to' range rather than the 'from'
   *          range
   * @param gff
   *          the GFF column data (not used by this implementation)
   * @return the mapping, or null if the Align value could not be interpreted
   */
  protected MapList buildMapping(String region, MappingType type,
          boolean forwardStrand, boolean featureIsOnTarget, String[] gff)
  {
    /*
     * process one "fromStart toStart fromCount" descriptor,
     * tolerating surrounding or repeated whitespace
     */
    String[] tokens = region == null ? new String[0]
            : region.trim().split("\\s+");
    if (tokens.length != 3)
    {
      System.err.println("Malformed Align descriptor: " + region);
      return null;
    }

    /*
     * get start/end of from/to mappings
     * if feature is on the target sequence we have to invert the sense
     */
    int alignFromStart;
    int alignToStart;
    int alignCount;
    try
    {
      alignFromStart = Integer.parseInt(tokens[0]);
      alignToStart = Integer.parseInt(tokens[1]);
      alignCount = Integer.parseInt(tokens[2]);
    } catch (NumberFormatException nfe)
    {
      System.err.println(nfe.toString());
      return null;
    }

    /*
     * a count below 1 would produce an inverted (end before start) range
     */
    if (alignCount < 1)
    {
      System.err.println("Malformed Align descriptor: " + region);
      return null;
    }

    int fromStart;
    int fromEnd;
    int toStart;
    int toEnd;

    if (featureIsOnTarget)
    {
      /*
       * Align values are in the opposite sense to the mapping, and the
       * count measures the 'to' range; scale it to get the 'from' length
       */
      fromStart = alignToStart;
      toStart = alignFromStart;
      toEnd = forwardStrand ? toStart + alignCount - 1
              : toStart - (alignCount - 1);
      int toLength = Math.abs(toEnd - toStart) + 1;
      int fromLength = toLength * type.getFromRatio() / type.getToRatio();
      fromEnd = fromStart + fromLength - 1;
    }
    else
    {
      // we use the 'Align' values here not the feature start/end
      // not clear why they may differ but it seems they can
      fromStart = alignFromStart;
      fromEnd = alignFromStart + alignCount - 1;
      int fromLength = fromEnd - fromStart + 1;
      int toLength = fromLength * type.getToRatio() / type.getFromRatio();
      toStart = alignToStart;
      toEnd = forwardStrand ? toStart + toLength - 1
              : toStart - (toLength - 1);
    }

    return constructMappingFromAlign(fromStart, fromEnd, toStart, toEnd,
            type);
  }

  /**
   * Returns a MappingType depending on the exonerate 'model' value, or null if
   * the model is null or is not one that is handled. The test is
   * case-insensitive, and looks for the model name anywhere within the
   * supplied value (for example "exonerate:protein2genome:local").
   *
   * @param model
   *          the value of the GFF source column
   * @return PeptideToNucleotide for protein2dna or protein2genome,
   *         NucleotideToNucleotide for coding2coding, coding2genome,
   *         cdna2genome or genome2genome, else null
   */
  protected static MappingType getMappingType(String model)
  {
    if (model == null)
    {
      return null;
    }

    /*
     * Locale.ROOT so that matching does not depend on the default locale
     */
    String mdl = model.toLowerCase(Locale.ROOT);

    if (mdl.contains(PROTEIN2DNA) || mdl.contains(PROTEIN2GENOME))
    {
      return MappingType.PeptideToNucleotide;
    }
    if (mdl.contains(CODING2CODING) || mdl.contains(CODING2GENOME)
            || mdl.contains(CDNA2GENOME) || mdl.contains(GENOME2GENOME))
    {
      return MappingType.NucleotideToNucleotide;
    }
    return null;
  }

  /**
   * Tests whether the GFF data looks like it was generated by exonerate, and is
   * a format we are willing to handle. This is the case if the type column is
   * 'similarity' (ignoring case) and the source column names an alignment
   * model for which {@link #getMappingType(String)} returns a mapping type.
   * A message is written to stderr for a similarity feature whose model is
   * not handled.
   *
   * @param columns
   *          the columns of the GFF line
   * @return true if the line is an exonerate similarity feature for a handled
   *         model, else false
   */
  public static boolean recognises(String[] columns)
  {
    if (!SIMILARITY.equalsIgnoreCase(columns[TYPE_COL]))
    {
      return false;
    }

    /*
     * inspect alignment model e.g. exonerate:protein2genome:local
     * using the same test that is applied when the line is processed
     */
    String model = columns[SOURCE_COL];
    if (getMappingType(model) != null)
    {
      return true;
    }
    System.err.println("Sorry, I don't handle exonerate model " + model);
    return false;
  }

  /**
   * An override to set feature group to "exonerate" instead of the default GFF
   * source value (column 2)
   *
   * @param gff
   *          the GFF column data
   * @param set
   *          parsed GFF column 9 key/value(s)
   * @return the sequence feature built by the superclass
   */
  @Override
  protected SequenceFeature buildSequenceFeature(String[] gff,
          Map<String, List<String>> set)
  {
    return super.buildSequenceFeature(gff, TYPE_COL, "exonerate", set);
  }

}