1 /*  $Id: su_block_multiple_alignment.hpp 180958 2010-01-14 15:26:51Z satskyse $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors:  Paul Thiessen
27 *
28 * File Description:
29 *      Classes to hold alignment data
30 *
31 * ===========================================================================
32 */
33 
34 #ifndef SU_BLOCK_MULTIPLE_ALIGNMENT__HPP
35 #define SU_BLOCK_MULTIPLE_ALIGNMENT__HPP
36 
37 #include <corelib/ncbistl.hpp>
38 #include <corelib/ncbiobj.hpp>
39 
40 #include <objects/seqalign/Seq_align.hpp>
41 
42 #include <list>
43 #include <vector>
44 #include <map>
45 
46 
47 BEGIN_SCOPE(struct_util)
48 
49 class Sequence;
50 class Block;
51 class UngappedAlignedBlock;
52 class UnalignedBlock;
53 class BLAST_Matrix;
54 
55 class NCBI_STRUCTUTIL_EXPORT BlockMultipleAlignment : public ncbi::CObject
56 {
57 public:
58     enum {
59         eUndefined = kMax_UInt
60     };
61 
62     typedef std::vector < const Sequence * > SequenceList;
63     BlockMultipleAlignment(const SequenceList& sequenceList);   // first sequence is master
64     ~BlockMultipleAlignment(void);
65 
66     // add a new aligned block - will be "owned" and deallocated by BlockMultipleAlignment
67     bool AddAlignedBlockAtEnd(UngappedAlignedBlock *newBlock);
68 
69     // these two should be called after all aligned blocks have been added; fills out
70     // unaligned blocks inbetween aligned blocks (and at ends). Also sets length.
71     bool AddUnalignedBlocks(void);
72 
73     // Fills out the BlockMap for mapping alignment column -> block+column, special colors,
74     // and sets up conservation colors (although they're not calculated until needed).
75     bool UpdateBlockMap(bool clearRowInfo = true);
76 
77     // find out if a residue is aligned, by row
78     bool IsAligned(unsigned int row, unsigned int seqIndex) const;
79 
80     // stuff regarding master sequence
GetMaster(void) const81     const Sequence * GetMaster(void) const { return m_sequences[0]; }
IsMaster(const Sequence * sequence) const82     bool IsMaster(const Sequence *sequence) const { return (sequence == m_sequences[0]); }
83 
84     // return sequence for given row
GetSequenceOfRow(unsigned int row) const85     const Sequence * GetSequenceOfRow(unsigned int row) const
86     {
87         if (row < m_sequences.size())
88             return m_sequences[row];
89         else
90             return NULL;
91     }
92 
93     // will be used to control padding of unaligned blocks
94     enum eUnalignedJustification {
95         eLeft,
96         eRight,
97         eCenter,
98         eSplit
99     };
100 
101     // return alignment position of left side first aligned block (eUndefined if no aligned blocks)
102     unsigned int GetFirstAlignedBlockPosition(void) const;
103 
104     // makes a new copy of itself
105     BlockMultipleAlignment * Clone(void) const;
106 
107     // character query interface - "column" must be in alignment range [0 .. AlignmentWidth()-1]
108     bool GetCharacterAt(unsigned int alignmentColumn, unsigned int row,
109         eUnalignedJustification justification, char *character) const;
110 
111     // get sequence and index (if any) at given position, and whether that residue is aligned
112     bool GetSequenceAndIndexAt(unsigned int alignmentColumn, unsigned int row,
113         eUnalignedJustification justification,
114         const Sequence **sequence, unsigned int *index, bool *isAligned) const;
115 
116     // given row and sequence index, return alignment index; not the most efficient function - use sparingly
117     unsigned int GetAlignmentIndex(unsigned int row, unsigned int seqIndex, eUnalignedJustification justification);
118 
119     // get a vector of all Blocks
120     typedef std::vector < ncbi::CConstRef < Block > > ConstBlockList;
121     void GetBlockList(ConstBlockList& cbl) const;
122 
123     // fill in a vector of UngappedAlignedBlocks
124     typedef std::vector < const UngappedAlignedBlock * > UngappedAlignedBlockList;
125     void GetUngappedAlignedBlocks(UngappedAlignedBlockList *blocks) const;
126     typedef std::vector < UngappedAlignedBlock * > ModifiableUngappedAlignedBlockList;
127     void GetModifiableUngappedAlignedBlocks(ModifiableUngappedAlignedBlockList *blocks);
128 
129     // PSSM for this alignment (cached)
130     const BLAST_Matrix * GetPSSM(void) const;
131     void RemovePSSM(void) const;
132 
133     // NULL if block before is aligned; if NULL passed, retrieves last block (if unaligned; else NULL)
134     const UnalignedBlock * GetUnalignedBlockBefore(const UngappedAlignedBlock *aBlock) const;
135 
NBlocks(void) const136     unsigned int NBlocks(void) const { return m_blocks.size(); }
137     unsigned int NAlignedBlocks(void) const;
NRows(void) const138     unsigned int NRows(void) const { return m_sequences.size(); }
AlignmentWidth(void) const139     unsigned int AlignmentWidth(void) const { return m_totalWidth; }
140 
141     // return a number from 1..n for aligned blocks, eUndefined for unaligned
GetAlignedBlockNumber(unsigned int alignmentIndex) const142     unsigned int GetAlignedBlockNumber(unsigned int alignmentIndex) const
143         { return m_blockMap[alignmentIndex].alignedBlockNum; }
144 
145     // for storing/querying info
GetRowDouble(unsigned int row) const146     double GetRowDouble(unsigned int row) const { return m_rowDoubles[row]; }
SetRowDouble(unsigned int row,double value) const147     void SetRowDouble(unsigned int row, double value) const { m_rowDoubles[row] = value; }
GetRowStatusLine(unsigned int row) const148     const std::string& GetRowStatusLine(unsigned int row) const { return m_rowStrings[row]; }
SetRowStatusLine(unsigned int row,const std::string & value) const149     void SetRowStatusLine(unsigned int row, const std::string& value) const { m_rowStrings[row] = value; }
150 
151     // empties all rows' infos
ClearRowInfo(void) const152     void ClearRowInfo(void) const
153     {
154         for (unsigned int r=0; r<NRows(); ++r) {
155             m_rowDoubles[r] = 0.0;
156             m_rowStrings[r].erase();
157         }
158     }
159 
160 
161     ///// editing functions /////
162 
163     // if in an aligned block, give block column and width of that position; otherwise eUndefined
164     void GetAlignedBlockPosition(unsigned int alignmentIndex,
165         unsigned int *blockColumn, unsigned int *blockWidth) const;
166 
167     // get seqIndex of slave aligned to the given master seqIndex; eUndefined if master residue unaligned
168     unsigned int GetAlignedSlaveIndex(unsigned int masterSeqIndex, unsigned int slaveRow) const;
169 
170     // returns true if any boundary shift actually occurred
171     bool MoveBlockBoundary(unsigned int columnFrom, unsigned int columnTo);
172 
173     // splits a block such that alignmentIndex is the first column of the new block;
174     // returns false if no split occurred (e.g. if index is not inside aligned block)
175     bool SplitBlock(unsigned int alignmentIndex);
176 
177     // merges all blocks that overlap specified range - assuming no unaligned blocks
178     // in that range. Returns true if any merge(s) occurred, false otherwise.
179     bool MergeBlocks(unsigned int fromAlignmentIndex, unsigned int toAlignmentIndex);
180 
181     // creates a block, if given region of an unaligned block in which no gaps
182     // are present. Returns true if a block is created.
183     bool CreateBlock(unsigned int fromAlignmentIndex, unsigned int toAlignmentIndex,
184         eUnalignedJustification justification);
185 
186     // deletes the block containing this index; returns true if deletion occurred.
187     bool DeleteBlock(unsigned int alignmentIndex);
188 
189     // deletes all blocks; returns true if there were any blocks to delete
190     bool DeleteAllBlocks(void);
191 
192     // shifts (horizontally) the residues in and immediately surrounding an
193     // aligned block; returns true if any shift occurs.
194     bool ShiftRow(unsigned int row, unsigned int fromAlignmentIndex, unsigned int toAlignmentIndex,
195         eUnalignedJustification justification);
196 
197     // delete a row; returns true if successful
198     bool DeleteRow(unsigned int row);
199 
200     // this function does two things: it extracts from a multiple alignment all slave rows listed for
201     // removal; and if pairwiseAlignments!=NULL, for each slave removed creates a new BlockMultipleAlignment
202     // that contains the alignment of just that slave with the master, as it was in the original multiple
203     // (i.e., not according to the corresponding pre-IBM MasterSlaveAlignment)
204     typedef std::list < BlockMultipleAlignment * > AlignmentList;
205     bool ExtractRows(const std::vector < unsigned int >& slavesToRemove, AlignmentList *pairwiseAlignments);
206 
207     // merge in the contents of the given alignment (assuming same master, compatible block structure),
208     // addings its rows to the end of this alignment; returns true if merge successful. Does not change
209     // block structure - just adds the part of new alignment's aligned blocks that intersect with this
210     // object's aligned blocks
211     bool MergeAlignment(const BlockMultipleAlignment *newAlignment);
212 
213     // reorder rows according to newOrder; returns true on success
214     bool ReorderRows(const std::vector < unsigned int >& newOrder);
215 
216 private:
217     SequenceList m_sequences;
218 
219     typedef std::list < ncbi::CRef < Block > > BlockList;
220     BlockList m_blocks;
221 
222     struct BlockInfo {
223         Block *block;
224         int blockColumn, alignedBlockNum;
225     };
226     typedef std::vector < BlockInfo > BlockMap;
227     BlockMap m_blockMap;
228 
229     unsigned int m_totalWidth;
230 
231     bool CheckAlignedBlock(const Block *newBlock) const;
232     UnalignedBlock * CreateNewUnalignedBlockBetween(const Block *left, const Block *right);
233     Block * GetBlockBefore(const Block *block);
234     Block * GetBlockAfter(const Block *block);
235     const Block * GetBlockBefore(const Block *block) const;
236     const Block * GetBlockAfter(const Block *block) const;
237     void InsertBlockBefore(Block *newBlock, const Block *insertAt);
238     void InsertBlockAfter(const Block *insertAt, Block *newBlock);
239     void RemoveBlock(Block *block);
240 
241     // for cacheing of residue->block lookups
242     mutable unsigned int m_cachePrevRow;
243     mutable const Block *m_cachePrevBlock;
244     mutable BlockList::const_iterator m_cacheBlockIterator;
245     void InitCache(void);
246 
247     // given a row and seqIndex, find block that contains that residue
248     const Block * GetBlock(unsigned int row, unsigned int seqIndex) const;
249 
250     // intended for volatile storage of row-associated info (e.g. for alignment scores, etc.)
251     mutable std::vector < double > m_rowDoubles;
252     mutable std::vector < std::string > m_rowStrings;
253 
254     // associated PSSM
255     mutable BLAST_Matrix *m_pssm;
256 };
257 
258 
259 // static function to create Seq-aligns out of multiple
260 NCBI_STRUCTUTIL_EXPORT
261 ncbi::objects::CSeq_align * CreatePairwiseSeqAlignFromMultipleRow(const BlockMultipleAlignment *multiple,
262     const BlockMultipleAlignment::UngappedAlignedBlockList& blocks, unsigned int slaveRow);
263 
264 
265 // base class for Block - BlockMultipleAlignment is made up of a list of these
266 class NCBI_STRUCTUTIL_EXPORT Block : public ncbi::CObject
267 {
268 public:
~Block(void)269     virtual ~Block(void) { }    // virtual destructor for base class
270 
271     // makes a new copy of itself
272     virtual Block * Clone(const BlockMultipleAlignment *newMultiple) const = 0;
273 
274     unsigned int m_width;
275 
276     virtual bool IsAligned(void) const = 0;
277 
278     // get sequence index for a column, which must be in block range (0 ... width-1)
279     virtual unsigned int GetIndexAt(unsigned int blockColumn, unsigned int row,
280         BlockMultipleAlignment::eUnalignedJustification justification) const = 0;
281 
282     // delete a row
283     virtual void DeleteRow(unsigned int row) = 0;
284     virtual void DeleteRows(std::vector < bool >& removeRows, unsigned int nToRemove) = 0;
285 
IsFrom(const BlockMultipleAlignment * alignment) const286     bool IsFrom(const BlockMultipleAlignment *alignment) const { return (alignment == m_parentAlignment); }
287 
288     // given a row number (from 0 ... nSequences-1), give the sequence range covered by this block
289     struct Range {
290         int from, to;
291     };
292 
GetRangeOfRow(int row) const293     const Range* GetRangeOfRow(int row) const { return &(m_ranges[row]); }
SetRangeOfRow(unsigned int row,int from,int to)294     void SetRangeOfRow(unsigned int row, int from, int to)
295     {
296         m_ranges[row].from = from;
297         m_ranges[row].to = to;
298     }
299 
300     // resize - add new (empty) rows at end
AddRows(unsigned int nNewRows)301     void AddRows(unsigned int nNewRows) { m_ranges.resize(m_ranges.size() + nNewRows); }
302 
NSequences(void) const303     unsigned int NSequences(void) const { return m_ranges.size(); }
304 
305     // reorder rows according to newOrder; returns true on success
306     bool ReorderRows(const std::vector < unsigned int >& newOrder);
307 
308 protected:
Block(const BlockMultipleAlignment * multiple)309     Block(const BlockMultipleAlignment *multiple) :
310         m_parentAlignment(multiple), m_ranges(multiple->NRows()) { }
311 
312     const BlockMultipleAlignment *m_parentAlignment;
313 
314     typedef std::vector < Range > RangeList;
315     RangeList m_ranges;
316 };
317 
318 
319 // a gapless aligned block; width must be >= 1
320 class NCBI_STRUCTUTIL_EXPORT UngappedAlignedBlock : public Block
321 {
322 public:
UngappedAlignedBlock(const BlockMultipleAlignment * multiple)323     UngappedAlignedBlock(const BlockMultipleAlignment *multiple) : Block(multiple) { }
324 
IsAligned(void) const325     bool IsAligned(void) const { return true; }
326 
GetIndexAt(unsigned int blockColumn,unsigned int row,BlockMultipleAlignment::eUnalignedJustification justification=BlockMultipleAlignment::eCenter) const327     unsigned int GetIndexAt(unsigned int blockColumn, unsigned int row,
328         BlockMultipleAlignment::eUnalignedJustification justification =
329             BlockMultipleAlignment::eCenter) const  // justification ignored for aligned block
330         { return (GetRangeOfRow(row)->from + blockColumn); }
331 
332     char GetCharacterAt(unsigned int blockColumn, unsigned int row) const;
333 
334     void DeleteRow(unsigned int row);
335     void DeleteRows(std::vector < bool >& removeRows, unsigned int nToRemove);
336 
337     Block * Clone(const BlockMultipleAlignment *newMultiple) const;
338 };
339 
340 
341 // an unaligned block; max width of block must be >=1. But range over any given
342 // sequence can be length 0 (but not <0); if length 0, "to" is the residue before
343 // the block, and "from" (= to+1) is the residue after.
344 class NCBI_STRUCTUTIL_EXPORT UnalignedBlock : public Block
345 {
346 public:
UnalignedBlock(const BlockMultipleAlignment * multiple)347     UnalignedBlock(const BlockMultipleAlignment *multiple) : Block(multiple) { }
348 
349     void Resize(void);  // recompute block width from current ranges
350 
IsAligned(void) const351     bool IsAligned(void) const { return false; }
352 
353     unsigned int GetIndexAt(unsigned int blockColumn, unsigned int row,
354         BlockMultipleAlignment::eUnalignedJustification justification) const;
355 
356     void DeleteRow(unsigned int row);
357     void DeleteRows(std::vector < bool >& removeRows, unsigned int nToRemove);
358 
359     Block * Clone(const BlockMultipleAlignment *newMultiple) const;
360 };
361 
362 END_SCOPE(struct_util)
363 
364 #endif // SU_BLOCK_MULTIPLE_ALIGNMENT__HPP
365