1 /* $Id: su_block_multiple_alignment.hpp 180958 2010-01-14 15:26:51Z satskyse $ 2 * =========================================================================== 3 * 4 * PUBLIC DOMAIN NOTICE 5 * National Center for Biotechnology Information 6 * 7 * This software/database is a "United States Government Work" under the 8 * terms of the United States Copyright Act. It was written as part of 9 * the author's official duties as a United States Government employee and 10 * thus cannot be copyrighted. This software/database is freely available 11 * to the public for use. The National Library of Medicine and the U.S. 12 * Government have not placed any restriction on its use or reproduction. 13 * 14 * Although all reasonable efforts have been taken to ensure the accuracy 15 * and reliability of the software and data, the NLM and the U.S. 16 * Government do not and cannot warrant the performance or results that 17 * may be obtained by using this software or data. The NLM and the U.S. 18 * Government disclaim all warranties, express or implied, including 19 * warranties of performance, merchantability or fitness for any particular 20 * purpose. 21 * 22 * Please cite the author in any work or product based on this material. 23 * 24 * =========================================================================== 25 * 26 * Authors: Paul Thiessen 27 * 28 * File Description: 29 * Classes to hold alignment data 30 * 31 * =========================================================================== 32 */ 33 34 #ifndef SU_BLOCK_MULTIPLE_ALIGNMENT__HPP 35 #define SU_BLOCK_MULTIPLE_ALIGNMENT__HPP 36 37 #include <corelib/ncbistl.hpp> 38 #include <corelib/ncbiobj.hpp> 39 40 #include <objects/seqalign/Seq_align.hpp> 41 42 #include <list> 43 #include <vector> 44 #include <map> 45 46 47 BEGIN_SCOPE(struct_util) 48 49 class Sequence; 50 class Block; 51 class UngappedAlignedBlock; 52 class UnalignedBlock; 53 class BLAST_Matrix; 54 55 class NCBI_STRUCTUTIL_EXPORT BlockMultipleAlignment : public ncbi::CObject 56 { 57 public: 58 enum { 59 eUndefined = kMax_UInt 60 }; 61 62 typedef std::vector < const Sequence * > SequenceList; 63 BlockMultipleAlignment(const SequenceList& sequenceList); // first sequence is master 64 ~BlockMultipleAlignment(void); 65 66 // add a new aligned block - will be "owned" and deallocated by BlockMultipleAlignment 67 bool AddAlignedBlockAtEnd(UngappedAlignedBlock *newBlock); 68 69 // these two should be called after all aligned blocks have been added; fills out 70 // unaligned blocks inbetween aligned blocks (and at ends). Also sets length. 71 bool AddUnalignedBlocks(void); 72 73 // Fills out the BlockMap for mapping alignment column -> block+column, special colors, 74 // and sets up conservation colors (although they're not calculated until needed). 75 bool UpdateBlockMap(bool clearRowInfo = true); 76 77 // find out if a residue is aligned, by row 78 bool IsAligned(unsigned int row, unsigned int seqIndex) const; 79 80 // stuff regarding master sequence GetMaster(void) const81 const Sequence * GetMaster(void) const { return m_sequences[0]; } IsMaster(const Sequence * sequence) const82 bool IsMaster(const Sequence *sequence) const { return (sequence == m_sequences[0]); } 83 84 // return sequence for given row GetSequenceOfRow(unsigned int row) const85 const Sequence * GetSequenceOfRow(unsigned int row) const 86 { 87 if (row < m_sequences.size()) 88 return m_sequences[row]; 89 else 90 return NULL; 91 } 92 93 // will be used to control padding of unaligned blocks 94 enum eUnalignedJustification { 95 eLeft, 96 eRight, 97 eCenter, 98 eSplit 99 }; 100 101 // return alignment position of left side first aligned block (eUndefined if no aligned blocks) 102 unsigned int GetFirstAlignedBlockPosition(void) const; 103 104 // makes a new copy of itself 105 BlockMultipleAlignment * Clone(void) const; 106 107 // character query interface - "column" must be in alignment range [0 .. AlignmentWidth()-1] 108 bool GetCharacterAt(unsigned int alignmentColumn, unsigned int row, 109 eUnalignedJustification justification, char *character) const; 110 111 // get sequence and index (if any) at given position, and whether that residue is aligned 112 bool GetSequenceAndIndexAt(unsigned int alignmentColumn, unsigned int row, 113 eUnalignedJustification justification, 114 const Sequence **sequence, unsigned int *index, bool *isAligned) const; 115 116 // given row and sequence index, return alignment index; not the most efficient function - use sparingly 117 unsigned int GetAlignmentIndex(unsigned int row, unsigned int seqIndex, eUnalignedJustification justification); 118 119 // get a vector of all Blocks 120 typedef std::vector < ncbi::CConstRef < Block > > ConstBlockList; 121 void GetBlockList(ConstBlockList& cbl) const; 122 123 // fill in a vector of UngappedAlignedBlocks 124 typedef std::vector < const UngappedAlignedBlock * > UngappedAlignedBlockList; 125 void GetUngappedAlignedBlocks(UngappedAlignedBlockList *blocks) const; 126 typedef std::vector < UngappedAlignedBlock * > ModifiableUngappedAlignedBlockList; 127 void GetModifiableUngappedAlignedBlocks(ModifiableUngappedAlignedBlockList *blocks); 128 129 // PSSM for this alignment (cached) 130 const BLAST_Matrix * GetPSSM(void) const; 131 void RemovePSSM(void) const; 132 133 // NULL if block before is aligned; if NULL passed, retrieves last block (if unaligned; else NULL) 134 const UnalignedBlock * GetUnalignedBlockBefore(const UngappedAlignedBlock *aBlock) const; 135 NBlocks(void) const136 unsigned int NBlocks(void) const { return m_blocks.size(); } 137 unsigned int NAlignedBlocks(void) const; NRows(void) const138 unsigned int NRows(void) const { return m_sequences.size(); } AlignmentWidth(void) const139 unsigned int AlignmentWidth(void) const { return m_totalWidth; } 140 141 // return a number from 1..n for aligned blocks, eUndefined for unaligned GetAlignedBlockNumber(unsigned int alignmentIndex) const142 unsigned int GetAlignedBlockNumber(unsigned int alignmentIndex) const 143 { return m_blockMap[alignmentIndex].alignedBlockNum; } 144 145 // for storing/querying info GetRowDouble(unsigned int row) const146 double GetRowDouble(unsigned int row) const { return m_rowDoubles[row]; } SetRowDouble(unsigned int row,double value) const147 void SetRowDouble(unsigned int row, double value) const { m_rowDoubles[row] = value; } GetRowStatusLine(unsigned int row) const148 const std::string& GetRowStatusLine(unsigned int row) const { return m_rowStrings[row]; } SetRowStatusLine(unsigned int row,const std::string & value) const149 void SetRowStatusLine(unsigned int row, const std::string& value) const { m_rowStrings[row] = value; } 150 151 // empties all rows' infos ClearRowInfo(void) const152 void ClearRowInfo(void) const 153 { 154 for (unsigned int r=0; r<NRows(); ++r) { 155 m_rowDoubles[r] = 0.0; 156 m_rowStrings[r].erase(); 157 } 158 } 159 160 161 ///// editing functions ///// 162 163 // if in an aligned block, give block column and width of that position; otherwise eUndefined 164 void GetAlignedBlockPosition(unsigned int alignmentIndex, 165 unsigned int *blockColumn, unsigned int *blockWidth) const; 166 167 // get seqIndex of slave aligned to the given master seqIndex; eUndefined if master residue unaligned 168 unsigned int GetAlignedSlaveIndex(unsigned int masterSeqIndex, unsigned int slaveRow) const; 169 170 // returns true if any boundary shift actually occurred 171 bool MoveBlockBoundary(unsigned int columnFrom, unsigned int columnTo); 172 173 // splits a block such that alignmentIndex is the first column of the new block; 174 // returns false if no split occurred (e.g. if index is not inside aligned block) 175 bool SplitBlock(unsigned int alignmentIndex); 176 177 // merges all blocks that overlap specified range - assuming no unaligned blocks 178 // in that range. Returns true if any merge(s) occurred, false otherwise. 179 bool MergeBlocks(unsigned int fromAlignmentIndex, unsigned int toAlignmentIndex); 180 181 // creates a block, if given region of an unaligned block in which no gaps 182 // are present. Returns true if a block is created. 183 bool CreateBlock(unsigned int fromAlignmentIndex, unsigned int toAlignmentIndex, 184 eUnalignedJustification justification); 185 186 // deletes the block containing this index; returns true if deletion occurred. 187 bool DeleteBlock(unsigned int alignmentIndex); 188 189 // deletes all blocks; returns true if there were any blocks to delete 190 bool DeleteAllBlocks(void); 191 192 // shifts (horizontally) the residues in and immediately surrounding an 193 // aligned block; returns true if any shift occurs. 194 bool ShiftRow(unsigned int row, unsigned int fromAlignmentIndex, unsigned int toAlignmentIndex, 195 eUnalignedJustification justification); 196 197 // delete a row; returns true if successful 198 bool DeleteRow(unsigned int row); 199 200 // this function does two things: it extracts from a multiple alignment all slave rows listed for 201 // removal; and if pairwiseAlignments!=NULL, for each slave removed creates a new BlockMultipleAlignment 202 // that contains the alignment of just that slave with the master, as it was in the original multiple 203 // (i.e., not according to the corresponding pre-IBM MasterSlaveAlignment) 204 typedef std::list < BlockMultipleAlignment * > AlignmentList; 205 bool ExtractRows(const std::vector < unsigned int >& slavesToRemove, AlignmentList *pairwiseAlignments); 206 207 // merge in the contents of the given alignment (assuming same master, compatible block structure), 208 // addings its rows to the end of this alignment; returns true if merge successful. Does not change 209 // block structure - just adds the part of new alignment's aligned blocks that intersect with this 210 // object's aligned blocks 211 bool MergeAlignment(const BlockMultipleAlignment *newAlignment); 212 213 // reorder rows according to newOrder; returns true on success 214 bool ReorderRows(const std::vector < unsigned int >& newOrder); 215 216 private: 217 SequenceList m_sequences; 218 219 typedef std::list < ncbi::CRef < Block > > BlockList; 220 BlockList m_blocks; 221 222 struct BlockInfo { 223 Block *block; 224 int blockColumn, alignedBlockNum; 225 }; 226 typedef std::vector < BlockInfo > BlockMap; 227 BlockMap m_blockMap; 228 229 unsigned int m_totalWidth; 230 231 bool CheckAlignedBlock(const Block *newBlock) const; 232 UnalignedBlock * CreateNewUnalignedBlockBetween(const Block *left, const Block *right); 233 Block * GetBlockBefore(const Block *block); 234 Block * GetBlockAfter(const Block *block); 235 const Block * GetBlockBefore(const Block *block) const; 236 const Block * GetBlockAfter(const Block *block) const; 237 void InsertBlockBefore(Block *newBlock, const Block *insertAt); 238 void InsertBlockAfter(const Block *insertAt, Block *newBlock); 239 void RemoveBlock(Block *block); 240 241 // for cacheing of residue->block lookups 242 mutable unsigned int m_cachePrevRow; 243 mutable const Block *m_cachePrevBlock; 244 mutable BlockList::const_iterator m_cacheBlockIterator; 245 void InitCache(void); 246 247 // given a row and seqIndex, find block that contains that residue 248 const Block * GetBlock(unsigned int row, unsigned int seqIndex) const; 249 250 // intended for volatile storage of row-associated info (e.g. for alignment scores, etc.) 251 mutable std::vector < double > m_rowDoubles; 252 mutable std::vector < std::string > m_rowStrings; 253 254 // associated PSSM 255 mutable BLAST_Matrix *m_pssm; 256 }; 257 258 259 // static function to create Seq-aligns out of multiple 260 NCBI_STRUCTUTIL_EXPORT 261 ncbi::objects::CSeq_align * CreatePairwiseSeqAlignFromMultipleRow(const BlockMultipleAlignment *multiple, 262 const BlockMultipleAlignment::UngappedAlignedBlockList& blocks, unsigned int slaveRow); 263 264 265 // base class for Block - BlockMultipleAlignment is made up of a list of these 266 class NCBI_STRUCTUTIL_EXPORT Block : public ncbi::CObject 267 { 268 public: ~Block(void)269 virtual ~Block(void) { } // virtual destructor for base class 270 271 // makes a new copy of itself 272 virtual Block * Clone(const BlockMultipleAlignment *newMultiple) const = 0; 273 274 unsigned int m_width; 275 276 virtual bool IsAligned(void) const = 0; 277 278 // get sequence index for a column, which must be in block range (0 ... width-1) 279 virtual unsigned int GetIndexAt(unsigned int blockColumn, unsigned int row, 280 BlockMultipleAlignment::eUnalignedJustification justification) const = 0; 281 282 // delete a row 283 virtual void DeleteRow(unsigned int row) = 0; 284 virtual void DeleteRows(std::vector < bool >& removeRows, unsigned int nToRemove) = 0; 285 IsFrom(const BlockMultipleAlignment * alignment) const286 bool IsFrom(const BlockMultipleAlignment *alignment) const { return (alignment == m_parentAlignment); } 287 288 // given a row number (from 0 ... nSequences-1), give the sequence range covered by this block 289 struct Range { 290 int from, to; 291 }; 292 GetRangeOfRow(int row) const293 const Range* GetRangeOfRow(int row) const { return &(m_ranges[row]); } SetRangeOfRow(unsigned int row,int from,int to)294 void SetRangeOfRow(unsigned int row, int from, int to) 295 { 296 m_ranges[row].from = from; 297 m_ranges[row].to = to; 298 } 299 300 // resize - add new (empty) rows at end AddRows(unsigned int nNewRows)301 void AddRows(unsigned int nNewRows) { m_ranges.resize(m_ranges.size() + nNewRows); } 302 NSequences(void) const303 unsigned int NSequences(void) const { return m_ranges.size(); } 304 305 // reorder rows according to newOrder; returns true on success 306 bool ReorderRows(const std::vector < unsigned int >& newOrder); 307 308 protected: Block(const BlockMultipleAlignment * multiple)309 Block(const BlockMultipleAlignment *multiple) : 310 m_parentAlignment(multiple), m_ranges(multiple->NRows()) { } 311 312 const BlockMultipleAlignment *m_parentAlignment; 313 314 typedef std::vector < Range > RangeList; 315 RangeList m_ranges; 316 }; 317 318 319 // a gapless aligned block; width must be >= 1 320 class NCBI_STRUCTUTIL_EXPORT UngappedAlignedBlock : public Block 321 { 322 public: UngappedAlignedBlock(const BlockMultipleAlignment * multiple)323 UngappedAlignedBlock(const BlockMultipleAlignment *multiple) : Block(multiple) { } 324 IsAligned(void) const325 bool IsAligned(void) const { return true; } 326 GetIndexAt(unsigned int blockColumn,unsigned int row,BlockMultipleAlignment::eUnalignedJustification justification=BlockMultipleAlignment::eCenter) const327 unsigned int GetIndexAt(unsigned int blockColumn, unsigned int row, 328 BlockMultipleAlignment::eUnalignedJustification justification = 329 BlockMultipleAlignment::eCenter) const // justification ignored for aligned block 330 { return (GetRangeOfRow(row)->from + blockColumn); } 331 332 char GetCharacterAt(unsigned int blockColumn, unsigned int row) const; 333 334 void DeleteRow(unsigned int row); 335 void DeleteRows(std::vector < bool >& removeRows, unsigned int nToRemove); 336 337 Block * Clone(const BlockMultipleAlignment *newMultiple) const; 338 }; 339 340 341 // an unaligned block; max width of block must be >=1. But range over any given 342 // sequence can be length 0 (but not <0); if length 0, "to" is the residue before 343 // the block, and "from" (= to+1) is the residue after. 344 class NCBI_STRUCTUTIL_EXPORT UnalignedBlock : public Block 345 { 346 public: UnalignedBlock(const BlockMultipleAlignment * multiple)347 UnalignedBlock(const BlockMultipleAlignment *multiple) : Block(multiple) { } 348 349 void Resize(void); // recompute block width from current ranges 350 IsAligned(void) const351 bool IsAligned(void) const { return false; } 352 353 unsigned int GetIndexAt(unsigned int blockColumn, unsigned int row, 354 BlockMultipleAlignment::eUnalignedJustification justification) const; 355 356 void DeleteRow(unsigned int row); 357 void DeleteRows(std::vector < bool >& removeRows, unsigned int nToRemove); 358 359 Block * Clone(const BlockMultipleAlignment *newMultiple) const; 360 }; 361 362 END_SCOPE(struct_util) 363 364 #endif // SU_BLOCK_MULTIPLE_ALIGNMENT__HPP 365