1 /**
2  * Author: Mark Larkin
3  *
4  * Copyright (c) 2007 Des Higgins, Julie Thompson and Toby Gibson.
5  */
6 /**
7  * The Alignment class is used to store the alignment that is being constructed.
8  * It also contains other information such as gap penalty masks etc.
9  * An object of this type will be passed by reference to the FileReader. This FileReader
10  * and the FileParsers will then set it up properly from the information given in the file.
11  * I have decided to put everything into vectors, string etc. No more array*'s, gets rid
12  * of the memory allocation problem.
13  *
14  * CHANGE:
15  * Mark Jan 16th 2007. I have changed the pasteSequencesIntoPosition function to allow
16  * explicit pastes into profile2.
 * Mark 25-1-2007. I have changed the class so that each sequence has a unique
 * identifier. Several functions were changed to allow this.
19  *
20  * 16-02-07,Nigel Brown(EMBL): Added friend NameIterator to allow a caller to
21  * process the name vector.
22  *
23  * 23-03-07,Nigel Brown(EMBL): added testUniqueNames() predicate, which
24  * compares new sequence names with those in the alignment vector BEFORE
25  * appending them.
26  */
27 
 // NOTE: Very important! The sequences are numbered from 1 to numSeqs, not from 0.
 // This is because the code was originally written in Fortran, where arrays begin at
 // 1. It has proved difficult to change: Ramu tried before and ran into problems,
 // so we decided to leave it this way.
32 
33 #ifndef ALIGNMENT_H
34 #define ALIGNMENT_H
35 
36 #include <vector>
37 #include <string>
38 #include <iomanip>
39 #include <exception>
40 #include <stdexcept>
41 #include "Sequence.h"
42 #include "../substitutionMatrix/globalmatrix.h"
43 #include "../general/userparams.h"
44 #include "../general/VectorOutOfRange.h"
45 #include "../general/SequenceNotFoundException.h"
46 
47 
// FIXME: because this object is used for both aligned and unaligned
// sequences, it would be nice to have an isAligned flag here (AW)
50 
51 using namespace std;
52 
53 namespace clustalw
54 {
55 
56 typedef std::vector<vector <int> > SeqArray;
57 
58 class Alignment
59 {
60     public:
61         /* Functions */
62         Alignment();
63         void addSequences(vector<Sequence>* seqVector);
64         void addSequences(SeqArray* seqVector);
65         void appendSequences(vector<Sequence>* seqVector);
66         vector<Sequence> cutSelectedSequencesFromAlignment(vector<int>* selected);
67         void pasteSequencesIntoPosition(vector<Sequence>* seqVector, int pos,
68                                         bool explicitPasteToProfile2 = false);
69 
resizeSeqArray(int size)70         void resizeSeqArray(int size){seqArray.resize(size); numSeqs = size - 1;
71                                       outputIndex.resize(size - 1); names.resize(size);
72                                       titles.resize(size);};
73         bool addOutputIndex(vector<int>* outputIndexToAdd);
74         bool appendOutputIndex(vector<int>* outputIndexToAppend);
75         void addSecStructMask1(vector<char>* secStructMaskToAdd);
76         void addSecStructMask2(vector<char>* secStructMaskToAdd);
77         void addSeqWeight(vector<int>* _seqWeight);
78         void addGapPenaltyMask1(vector<char>* gapPenaltyMaskToAdd);
79         void addGapPenaltyMask2(vector<char>* gapPenaltyMaskToAdd);
80         vector<char>* getSecStructMask1();
81         vector<char>* getSecStructMask2();
82         const vector<int>* getOutputIndex();
83         vector<char>* getGapPenaltyMask1();
84         vector<char>* getGapPenaltyMask2();
85         void addSecStructName1(string nameToAdd);
86         void addSecStructName2(string nameToAdd);
87         int alignScore(void);
88         int countGaps(int s1, int s2, int l);
89         void resetAlign();
90         void fixGaps();
91         float countid(int s1, int s2);
92 
getSequence(int index)93         const vector<int>* getSequence(int index){return &seqArray[index];}; // For Pairwise!
getSequence(int index)94         const vector<int>* getSequence(int index) const {return &seqArray[index];};
95         const vector<int>* getSequenceFromUniqueId(unsigned long id); // For iteration
getSeqArray()96         const SeqArray* getSeqArray() const {return &seqArray;}; // For multiple align!
getSeqArrayForRealloc()97         SeqArray* getSeqArrayForRealloc(){return &seqArray;};
98         void updateSequence(int index, const vector<int>* seq);
99 
100         bool checkAllNamesDifferent(string *offendingSeq);
101         bool testUniqueNames(vector<Sequence>* seqVector, string *offendingSeq);
102         void clearAlignment();
103         void clearSecStruct1();
104         void clearSecStruct2();
105         void printSequencesAddedInfo();
106 
107         string getSecStructName1();
108         string getSecStructName2();
getNumSeqs()109         int getNumSeqs() const {return numSeqs;};
110         int getMaxNames();
getMaxAlnLength()111         int getMaxAlnLength(){return maxAlignmentLength;};
setMaxAlnLength(int len)112         void setMaxAlnLength(int len){maxAlignmentLength = len;};
113         int getLengthLongestSequence();
114         int getLengthLongestSequence(int firstSeq, int lastSeq);
getSeqLength(int index)115         int getSeqLength(int index) const {return seqArray[index].size() - 1;};
116         int getSecStructMask1Element(int index);
117         int getSecStructMask2Element(int index);
118         int getGapPenaltyMask1Element(int index);
119         int getGapPenaltyMask2Element(int index);
120         int getOutputIndex(int index);
121         int getSeqWeight(int index) const;
getSeqWeights()122         const vector<int>* getSeqWeights() const{return &seqWeight;}
123         string getName(int index);
124         string getTitle(int index);
125         vector<int>* QTcalcHistColumnHeights(int firstSeq, int nSeqs,
126                                            Array2D<int>* exceptionalRes);
127                                             // NOTE July 13, for Qt
128 
129         // NOTE the following functions are to be used when we are doing a profile
130         // alignment. It resets the gaps from fixed.
131         void resetProfile1();
132         void resetProfile2();
133         void resetAllSeqWeights();
134 
135         int searchForString(bool* found, int seq, int beginRes, string search);
136         void removeGapsFromSelectedSeqs(vector<int>* selected);
137         void removeGapOnlyColsFromSelectedSeqs(vector<int>* selected);
138         void removeAllGapOnlyColumns(int fSeq, int lSeq, int profileNum);
139         void setDefaultOutputIndex();
140         bool removeAllOutsideRange(int beginPos, int endPos);
141         bool updateRealignedRange(SeqArray realignedSeqs, int beginPos, int endPos);
142         bool reloadAlignment();
143 
getProfile1NumSeqs()144         int getProfile1NumSeqs(){return profile1NumSeqs;};
setProfile1NumSeqs(int value)145         void setProfile1NumSeqs(int value){profile1NumSeqs = value;}
146         bool isGap(int seq, int col) const;
147         void calculateMaxLengths();
148 
149         /**
150          * The following functions are for the iteration output order.
151          */
152         unsigned long getUniqueId(int seq);
153 
debugPrintArray()154         void debugPrintArray(){debugPrintSeqArray(&seqArray);}
155         void debugPrintSeqArray(SeqArray* arrayToPrint);
156         void debugPrintProfile1();
157         void debugPrintProfile2();
158         void debugPrintOutAlignInfo();
159         void debugPrintAllNames();
160         void debugPrintSequences();
161 
162         /* Attributes */
163 
164         /* Friends */
165         class NameIterator;
166         friend class NameIterator;
167 
168         class NameIterator
169         {
170             private:
171                 Alignment *alignment;
172                 vector<string>::iterator i;
173             public:
174                 void begin(Alignment *alignment);
175                 const string next();
176                 bool end();
177         };
178     private:
179         /* Functions */
180 
181         void addSequencesToVector(vector<Sequence>* seqVector);
182         int getSequenceLength(int index);
183         void sortScores(vector<float>* scores, int f, int l);
184         void swap(vector<float>* scores, int s1, int s2);
185         bool keepPortionOfSeqArray(int beginRangeIndex, int endRangeIndex);
186 
187         void clearSeqArray();
188         /* Attributes */
189         int maxNames;
190         int maxAlignmentLength;
191         int lengthLongestSequence;
192         int numSeqs;
193         vector<int> outputIndex;
194         vector<unsigned long> sequenceIds; // Mark change: To help with output order
195         vector<int> seqWeight;
196         SeqArray seqArray;
197         vector<string> names;
198         vector<string> titles;
199         vector<char> gapPenaltyMask1;
200         vector<char> gapPenaltyMask2;
201         vector<char> secStructMask1;
202         vector<char> secStructMask2;
203         string secStructName1;
204         string secStructName2;
205         vector<int> histogramColumnHeights; // NOTE July 13, for Qt
206         int profile1NumSeqs;
207         int gapPos1, gapPos2;
208 };
209 }
210 #endif
211 
212