1 // ========================================================================== 2 // SeqAn - The Library for Sequence Analysis 3 // ========================================================================== 4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin 5 // All rights reserved. 6 // 7 // Redistribution and use in source and binary forms, with or without 8 // modification, are permitted provided that the following conditions are met: 9 // 10 // * Redistributions of source code must retain the above copyright 11 // notice, this list of conditions and the following disclaimer. 12 // * Redistributions in binary form must reproduce the above copyright 13 // notice, this list of conditions and the following disclaimer in the 14 // documentation and/or other materials provided with the distribution. 15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of 16 // its contributors may be used to endorse or promote products derived 17 // from this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE 23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 29 // DAMAGE. 30 // 31 // ========================================================================== 32 // Author: Hannes Hauswedell <hannes.hauswedell@fu-berlin.de> 33 // ========================================================================== 34 // Reduced Versions of the 24-letter amino acid alphabet 35 // ========================================================================== 36 37 #ifndef SEQAN_REDUCED_AMINOACID_CLUSTER_RED_BASE_H_ 38 #define SEQAN_REDUCED_AMINOACID_CLUSTER_RED_BASE_H_ 39 40 namespace seqan { 41 42 // ============================================================================ 43 // Forwards 44 // ============================================================================ 45 46 // ============================================================================ 47 // Tags, Classes, Enums 48 // ============================================================================ 49 50 // ----------------------------------------------------------------------- 51 // Tag ClusterReduction 52 // ----------------------------------------------------------------------- 53 54 /* 55 * @class ClusterReduction 56 * @brief Specialization for @link ReducedAminoAcid @endlink 57 * @headerfile seqan/reduced_aminoacid.h 58 * 59 * @signature template <unsigned char n, unsigned char m = 24, typename TMatrix = Blosum62> 60 * struct ClusterReduction; 61 * 62 * @tparam n the size of the reduced alphabet (between 2 and m-1) 63 * @tparam m size to truncate alphabet to, <b>before</b> clustering 64 * (one of 20, 22, 24; default 24) 65 * @tparam TMatrix Matrix used for clustering (default @link Blosum62 @endlink, 66 * none other supported right now) 67 * 68 * @section WhenToUse 69 * 70 * Use m = 24 when you expect 'X' and '*' in the dataset you reduce 71 * from. This is especially the case on translated genomic reads. 72 * 73 * If you have validated protein sequences, you can use can use m = 20 or 74 * m = 22, which will not include special characters (see 75 * @link AminoAcid @endlink for details). 76 * 77 * @section Background 78 * 79 * The method employed for reducing the alphabet is similar to Murphy et al, 80 * 2000, <a href="http://www.ncbi.nlm.nih.gov/pubmed/10775656">http://www.ncbi.nlm.nih.gov/pubmed/10775656</a> 81 * 82 * Correlation coefficients for the Blosum62 scores of all pairs of amino 83 * acids in the alphabet were computed and clustered with WPGMA (using 84 * UPGMA as second criterium when WPGMA yields the same 85 * distance between two clusters). 86 * 87 * <img src="ClusterReduction.png"> 88 * 89 * The exact clustering for m = 24. 90 * 91 */ 92 93 template <unsigned char n, unsigned char m = 24, typename TMatrix = Blosum62> 94 struct ClusterReduction; 95 96 // ============================================================================ 97 // Metafunctions 98 // ============================================================================ 99 100 // ----------------------------------------------------------------------- 101 // Metafunction ValueSize 102 // ----------------------------------------------------------------------- 103 104 template <unsigned char n, unsigned char m, typename TMatrix> 105 struct ValueSize< 106 SimpleType<unsigned char, 107 ReducedAminoAcid_<ClusterReduction<n, m, TMatrix> > > > 108 { 109 typedef uint8_t Type; 110 static const Type VALUE = n; 111 }; 112 113 // ----------------------------------------------------------------------- 114 // Metafunction BitPerValue 115 // ----------------------------------------------------------------------- 116 117 template <unsigned char n, unsigned char m, typename TMatrix> 118 struct BitsPerValue< 119 SimpleType<unsigned char, 120 ReducedAminoAcid_<ClusterReduction<n, m, TMatrix> > > > 121 { 122 typedef uint8_t Type; 123 static const Type VALUE = Log2<n>::VALUE; 124 }; 125 126 // ----------------------------------------------------------------------- 127 // Translation Tables (implementations see extra files) 128 // ----------------------------------------------------------------------- 129 130 // ============================================================================ 131 // Functions 132 // ============================================================================ 133 134 } 135 #endif // def SEQAN_REDUCED_AMINOACID_CLUSTER_RED_BASE_H_ 136