1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Hannes Hauswedell <hannes.hauswedell@fu-berlin.de>
33 // ==========================================================================
34 // Reduced Versions of the 24-letter amino acid alphabet
35 // ==========================================================================
36 
37 #ifndef SEQAN_REDUCED_AMINOACID_CLUSTER_RED_BASE_H_
38 #define SEQAN_REDUCED_AMINOACID_CLUSTER_RED_BASE_H_
39 
40 namespace seqan {
41 
42 // ============================================================================
43 // Forwards
44 // ============================================================================
45 
46 // ============================================================================
47 // Tags, Classes, Enums
48 // ============================================================================
49 
50 // -----------------------------------------------------------------------
51 // Tag ClusterReduction
52 // -----------------------------------------------------------------------
53 
54 /*
55  * @class ClusterReduction
56  * @brief Specialization for @link ReducedAminoAcid @endlink
57  * @headerfile seqan/reduced_aminoacid.h
58  *
59  * @signature template <unsigned char n, unsigned char m = 24, typename TMatrix = Blosum62>
60  * struct ClusterReduction;
61  *
62  * @tparam n the size of the reduced alphabet (between 2 and m-1)
63  * @tparam m size to truncate alphabet to, <b>before</b> clustering
64  * (one of 20, 22, 24; default 24)
65  * @tparam TMatrix Matrix used for clustering (default @link Blosum62 @endlink,
66  * none other supported right now)
67  *
68  * @section WhenToUse
69  *
70  * Use m = 24 when you expect 'X' and '*' in the dataset you reduce
71  * from. This is especially the case on translated genomic reads.
72  *
73  * If you have validated protein sequences, you can use m = 20 or
74  * m = 22, which will not include special characters (see
75  * @link AminoAcid @endlink for details).
76  *
77  * @section Background
78  *
79  * The method employed for reducing the alphabet is similar to Murphy et al,
80  * 2000, <a href="http://www.ncbi.nlm.nih.gov/pubmed/10775656">http://www.ncbi.nlm.nih.gov/pubmed/10775656</a>
81  *
82  * Correlation coefficients for the Blosum62 scores of all pairs of amino
83  * acids in the alphabet were computed and clustered with WPGMA (using
84  * UPGMA as the secondary criterion when WPGMA yields the same
85  * distance between two clusters).
86  *
87  * <img src="ClusterReduction.png">
88  *
89  * The exact clustering for m = 24.
90  *
91  */
92 
93 template <unsigned char n, unsigned char m = 24, typename TMatrix = Blosum62>
94 struct ClusterReduction;
95 
96 // ============================================================================
97 // Metafunctions
98 // ============================================================================
99 
100 // -----------------------------------------------------------------------
101 // Metafunction ValueSize
102 // -----------------------------------------------------------------------
103 
104 template <unsigned char n, unsigned char m, typename TMatrix>
105 struct ValueSize<
106         SimpleType<unsigned char,
107                    ReducedAminoAcid_<ClusterReduction<n, m, TMatrix> > > >
108 {
109     typedef uint8_t Type;
110     static const Type VALUE = n;
111 };
112 
113 // -----------------------------------------------------------------------
114 // Metafunction BitsPerValue
115 // -----------------------------------------------------------------------
116 
117 template <unsigned char n, unsigned char m, typename TMatrix>
118 struct BitsPerValue<
119          SimpleType<unsigned char,
120                     ReducedAminoAcid_<ClusterReduction<n, m, TMatrix> > > >
121 {
122     typedef uint8_t Type;
123     static const Type VALUE = Log2<n>::VALUE;
124 };
125 
126 // -----------------------------------------------------------------------
127 // Translation Tables (implementations see extra files)
128 // -----------------------------------------------------------------------
129 
130 // ============================================================================
131 // Functions
132 // ============================================================================
133 
134 }
135 #endif // def SEQAN_REDUCED_AMINOACID_CLUSTER_RED_BASE_H_
136