1 ////////////////////////////////////////////////////////////////////////////////
2 //! \file
3 //! \author Adam M Phillippy
4 //! \date 03/26/2003
5 //!
6 //! \brief Class declarations for the handling of delta alignment files
7 //!
8 //! \see delta.cc
9 ////////////////////////////////////////////////////////////////////////////////
10 
11 #ifndef __DELTA_HH
12 #define __DELTA_HH
13 
14 #include "tigrinc.hh"
15 #include <cassert>
16 #include <string>
17 #include <vector>
18 #include <fstream>
19 #include <cstdlib>
20 #include <iostream>
21 #include <map>
22 
23 
24 const std::string NUCMER_STRING = "NUCMER"; //!< nucmer id string
25 const std::string PROMER_STRING = "PROMER"; //!< promer id string
26 
27 typedef char AlignmentType_t;               //!< type of alignment data
28 const AlignmentType_t NULL_DATA = 0;        //!< unknown alignment data type
29 const AlignmentType_t NUCMER_DATA = 'N';    //!< nucmer alignment data
30 const AlignmentType_t PROMER_DATA = 'P';    //!< promer alignment data
31 
32 typedef unsigned char Dir_t;                //!< directional type
33 const Dir_t FORWARD_DIR = 0;                //!< forward direction
34 const Dir_t REVERSE_DIR = 1;                //!< reverse direction
35 
36 
37 
38 //===================================================== DeltaAlignment_t =======
39 struct DeltaAlignment_t
40 //!< A single delta encoded alignment region
41 {
42   long sR;    //!< start coordinate in the reference
43   long eR;    //!< end coordinate in the reference
44   long sQ;    //!< start coordinate in the reference
45   long eQ;    //!< end coordinate in the reference
46   long idyc;  //!< number of mismatches in the alignment
47   long simc;  //!< number of similarity scores < 1 in the alignment
48   long stpc;  //!< number of stop codons in the alignment
49 
50   float idy;               //!< percent identity [0 - 100]
51   float sim;               //!< percent similarity [0 - 100]
52   float stp;               //!< percent stop codon [0 - 100]
53 
54   std::vector<long> deltas;  //!< delta encoded alignment informaiton
55 
DeltaAlignment_tDeltaAlignment_t56   DeltaAlignment_t ( )
57   {
58     clear ( );
59   }
60 
clearDeltaAlignment_t61   void clear ( )
62   {
63     sR = eR = sQ = eQ = 0;
64     idy = sim = stp = 0;
65     deltas.clear ( );
66   }
67 
68   // Read one alignment
read_nucmerDeltaAlignment_t69   inline bool read_nucmer(std::istream& is, const bool read_deltas = true) { return read(is, false, read_deltas); }
read_promerDeltaAlignment_t70   inline bool read_promer(std::istream& is, const bool read_deltas = true) { return read(is, true, read_deltas); }
71   bool read(std::istream& is, const bool promer, const bool read_deltas = true);
72 };
operator >>(std::istream & is,DeltaAlignment_t & a)73 inline std::istream& operator>>(std::istream& is, DeltaAlignment_t& a) {
74   a.read_nucmer(is);
75   return is;
76 }
operator <<(std::ostream & os,const DeltaAlignment_t & a)77 inline std::ostream& operator<<(std::ostream& os, const DeltaAlignment_t& a) {
78   os << a.sR << ' ' << a.eR << ' ' << a.sQ << ' ' << a.eQ << ' ' << a.idyc << ' ' << a.simc << ' ' << a.stpc << '\n';
79   for(auto d : a.deltas)
80     os << d << '\n';
81   return os;
82 }
83 
84 
85 //===================================================== DeltaRecord_t ==========
86 struct DeltaRecord_t
87 //!< A delta record representing the alignments between two sequences
88 {
89   std::string idR;         //!< reference contig ID
90   std::string idQ;         //!< query contig ID
91   long lenR;  //!< length of the reference contig
92   long lenQ;  //!< length of the query contig
93 
94   std::vector<DeltaAlignment_t> aligns; //!< alignments between the two seqs
95 
DeltaRecord_tDeltaRecord_t96   DeltaRecord_t ( )
97   {
98     clear ( );
99   }
100 
clearDeltaRecord_t101   void clear ( )
102   {
103     idR.erase ( );
104     idQ.erase ( );
105     lenR = lenQ = 0;
106     aligns.clear ( );
107   }
108 
109   bool read(std::istream& is);
110 };
operator >>(std::istream & is,DeltaRecord_t & r)111 inline std::istream& operator>>(std::istream& is, DeltaRecord_t& r) {
112   r.read(is);
113   return is;
114 }
115 
operator <<(std::ostream & os,const DeltaRecord_t & r)116 inline std::ostream& operator<<(std::ostream& os, const DeltaRecord_t& r) {
117   return os << '>' << r.idR << ' ' << r.idQ << ' ' << r.lenR << ' ' << r.lenQ;
118 }
119 
120 //===================================================== DeltaReader_t ==========
121 //! \brief Delta encoded file reader class
122 //!
123 //! Handles the input of delta encoded alignment information for various MUMmer
124 //! utilities. Very basic functionality, can be expanded as necessary...
125 //!
126 //! \see DeltaRecord_t
127 //==============================================================================
128 class DeltaReader_t {
129 
130 private:
131 
132   std::string delta_path_m;      //!< the name of the delta input file
133   std::ifstream delta_stream_m;  //!< the delta file input stream
134   std::string data_type_m;       //!< the type of alignment data
135   std::string reference_path_m;  //!< the name of the reference file
136   std::string query_path_m;      //!< the name of the query file
137   DeltaRecord_t record_m;        //!< the current delta information record
138   bool is_record_m;              //!< there is a valid record in record_m
139   bool is_open_m;                //!< delta stream is open
140 
141 
142   //--------------------------------------------------- readNextAlignment ------
143   //! \brief Reads in a delta encoded alignment
144   //!
145   //! \param align read info into this structure
146   //! \param read_deltas read delta information yes/no
147   //! \pre delta_stream is positioned at the beginning of an alignment header
148   //! \return void
149   //!
150   void readNextAlignment (DeltaAlignment_t & align, const bool read_deltas);
151 
152 
153   //--------------------------------------------------- readNextRecord ---------
154   //! \brief Reads in the next delta record from the delta file
155   //!
156   //! \param read_deltas read delta information yes/no
157   //! \pre delta file must be open
158   //! \return true on success, false on EOF
159   //!
160   bool readNextRecord (const bool read_deltas);
161 
162 
163   //--------------------------------------------------- checkStream ------------
164   //! \brief Check stream status and abort program if an error has occured
165   //!
166   //! \return void
167   //!
checkStream()168   void checkStream ( )
169   {
170     if ( !delta_stream_m.good ( ) )
171       {
172 	std::cerr << "ERROR: Could not parse delta file, "
173 		  << delta_path_m << std::endl;
174 
175 
176         std::cerr << "error no: "
177                   << int(delta_stream_m.rdstate() & std::ifstream::failbit)
178                   << int(delta_stream_m.rdstate() & std::ifstream::badbit)
179                   << int(delta_stream_m.rdstate() & std::ifstream::eofbit)
180                   << std::endl;
181 	exit (-1);
182       }
183   }
184 
185 
186 public:
187   //--------------------------------------------------- DeltaReader_t ----------
188   //! \brief Default constructor
189   //!
190   //! \return void
191   //!
DeltaReader_t()192   DeltaReader_t ( )
193   {
194     is_record_m = false;
195     is_open_m = false;
196   }
197 
198 
199   //--------------------------------------------------- ~DeltaReader_t ---------
200   //! \brief Default destructor
201   //!
202   //! \return void
203   //!
~DeltaReader_t()204   ~DeltaReader_t ( )
205   {
206     close ( );
207   }
208 
209 
210   //--------------------------------------------------- open -------------------
211   //! \brief Opens a delta file by path
212   //!
213   //! \param delta file to open
214   //! \return void
215   //!
216   void open (const std::string & delta_path);
217 
218 
219   //--------------------------------------------------- close ------------------
220   //! \brief Closes any open delta file and resets class members
221   //!
222   //! \return void
223   //!
close()224   void close ( )
225   {
226     delta_path_m.erase ( );
227     delta_stream_m.close ( );
228     data_type_m.erase ( );
229     reference_path_m.erase ( );
230     query_path_m.erase ( );
231     record_m.clear ( );
232     is_record_m = false;
233     is_open_m = false;
234   }
235 
236 
237   //--------------------------------------------------- readNext --------------
238   //! \brief Reads in the next delta record from the delta file
239   //!
240   //! \param read_deltas read delta information yes/no
241   //! \pre delta file must be open
242   //! \return true on success, false on EOF
243   //!
readNext(bool getdeltas=true)244   inline bool readNext (bool getdeltas = true)
245   {
246     return readNextRecord (getdeltas);
247   }
248 
249 
250 
251   //--------------------------------------------------- readNextHeadersOnly ----
252   //! \brief Reads in the next delta record from the delta file
253   //!
254   //! Only reads the alignment header information, does not read in the delta
255   //! information and leaves each alignment structure's delta vector empty.
256   //!
257   //! \pre delta file must be open
258   //! \return true on success, false on EOF
259   //!
readNextHeadersOnly()260   inline bool readNextHeadersOnly ( )
261   {
262     return readNextRecord (false);
263   }
264 
265 
266   //--------------------------------------------------- getRecord --------------
267   //! \brief Returns a reference to the current delta record
268   //!
269   //! \pre readNext( ) was successfully called in advance
270   //! \return true on success, false on failure or end of file
271   //!
getRecord() const272   const DeltaRecord_t & getRecord ( ) const
273   {
274     assert (is_record_m);
275     return record_m;
276   }
277 
278 
279   //--------------------------------------------------- getDeltaPath -----------
280   //! \brief Get the path of the current delta file
281   //!
282   //! \pre delta file is open
283   //! \return the path of the delta file
284   //!
getDeltaPath() const285   const std::string & getDeltaPath ( ) const
286   {
287     assert (is_open_m);
288     return delta_path_m;
289   }
290 
291 
292   //--------------------------------------------------- getDataType ------------
293   //! \brief Get the data type of the current delta file
294   //!
295   //! \pre delta file is open
296   //! \return the data type of the current file
297   //!
getDataType() const298   const std::string & getDataType ( ) const
299   {
300     assert (is_open_m);
301     return data_type_m;
302   }
303 
304 
305   //--------------------------------------------------- getReferencePath -------
306   //! \brief Get the path of the MUMmer reference file
307   //!
308   //! \pre delta file is open
309   //! \return the path of the MUMmer reference file
310   //!
getReferencePath() const311   const std::string & getReferencePath ( ) const
312   {
313     assert (is_open_m);
314     return reference_path_m;
315   }
316 
317 
318   //--------------------------------------------------- getQueryPath -----------
319   //! \brief Get the path of the MUMmer query file
320   //!
321   //! \pre delta file is open
322   //! \return the path of the MUMmer query file
323   //!
getQueryPath() const324   const std::string & getQueryPath ( ) const
325   {
326     assert (is_open_m);
327     return query_path_m;
328   }
329 };
330 
331 
332 
333 //===================================================== SNP_t ==================
334 struct DeltaEdgelet_t;
335 struct DeltaEdge_t;
336 struct DeltaNode_t;
337 struct SNP_t
338      //!< A single nuc/aa poly
339 {
340   long buff;
341   char cQ, cR;
342   long pQ, pR;
343   int conQ, conR;
344   std::string ctxQ, ctxR;
345   DeltaEdgelet_t * lp;
346   DeltaEdge_t * ep;
347 
SNP_tSNP_t348   SNP_t ( )
349   {
350     cQ = cR = 0;
351     buff = pQ = pR = 0;
352     conQ = conR = 0;
353   };
354 };
355 
356 
357 
358 //===================================================== DeltaEdgelet_t =========
359 struct DeltaEdgelet_t
360 //!< A piece of a delta graph edge, a single alignment
361 {
362   unsigned char isGOOD : 1;   //!< meets the requirements
363   unsigned char isQLIS : 1;   //!< is part of the query's LIS
364   unsigned char isRLIS : 1;   //!< is part of the reference's LIS
365   unsigned char isGLIS : 1;   //!< is part of the reference/query LIS
366   unsigned char dirR   : 1;   //!< reference match direction
367   unsigned char dirQ   : 1;   //!< query match direction
368 
369   DeltaEdge_t * edge;
370   float idy, sim, stp;        //!< percent identity [0 - 1]
371   long idyc, simc, stpc;      //!< idy, sim, stp counts
372   long loQ, hiQ, loR, hiR;    //!< alignment bounds
373   int frmQ, frmR;             //!< reading frame
374 
375   std::string delta;          //!< delta information
376   std::vector<SNP_t *> snps;  //!< snps for this edgelet
377 
DeltaEdgelet_tDeltaEdgelet_t378   DeltaEdgelet_t ( )
379   {
380     edge = NULL;
381     isGOOD = true;
382     isQLIS = isRLIS = isGLIS = false;
383     dirR = dirQ = FORWARD_DIR;
384     idy = sim = stp = 0;
385     idyc = simc = stpc = 0;
386     loQ = hiQ = loR = hiR = 0;
387     frmQ = frmR = 1;
388   }
389 
~DeltaEdgelet_tDeltaEdgelet_t390   ~DeltaEdgelet_t ( )
391   {
392     std::vector<SNP_t *>::iterator i;
393     for ( i = snps . begin( ); i != snps . end( ); ++ i )
394       delete (*i);
395   }
396 
isNegativeDeltaEdgelet_t397   bool isNegative() const
398   { return ( dirR != dirQ ); }
399 
isPositiveDeltaEdgelet_t400   bool isPositive() const
401   { return ( dirR == dirQ ); }
402 
slopeDeltaEdgelet_t403   int slope() const
404   { return ( dirR == dirQ ? +1 : -1 ); }
405 
loR2QDeltaEdgelet_t406   long loR2Q() const
407   { return ( isPositive() ? loQ : hiQ ); }
408 
hiR2QDeltaEdgelet_t409   long hiR2Q() const
410   { return ( isPositive() ? hiQ : loQ ); }
411 
loQ2RDeltaEdgelet_t412   long loQ2R() const
413   { return ( isPositive() ? loR : hiR ); }
414 
hiQ2RDeltaEdgelet_t415   long hiQ2R() const
416   { return ( isPositive() ? hiR : loR ); }
417 };
418 
419 
420 
421 //===================================================== DeltaEdge_t ============
422 struct DeltaEdge_t
423 //!< A delta graph edge, alignments between a single reference and query
424 {
425   DeltaNode_t * refnode;      //!< the adjacent reference node
426   DeltaNode_t * qrynode;      //!< the adjacent query node
427   std::vector<DeltaEdgelet_t *> edgelets;  //!< the set of individual alignments
428 
DeltaEdge_tDeltaEdge_t429   DeltaEdge_t ( )
430   { refnode = qrynode = NULL; }
431 
~DeltaEdge_tDeltaEdge_t432   ~DeltaEdge_t ( )
433   {
434     std::vector<DeltaEdgelet_t *>::iterator i;
435     for ( i = edgelets . begin( ); i != edgelets . end( ); ++ i )
436       delete (*i);
437   }
438 
439   void build (const DeltaRecord_t & rec);
440 };
441 
442 
443 //===================================================== DeltaNode_t ============
444 struct DeltaNode_t
445 //!< A delta graph node, contains the sequence information
446 {
447   const std::string * id;             //!< the id of the sequence
448   char * seq;                         //!< the DNA sequence
449   long len;              //!< the length of the sequence
450   std::vector<DeltaEdge_t *> edges;   //!< the set of related edges
451 
DeltaNode_tDeltaNode_t452   DeltaNode_t ( )
453   { id = NULL; seq = NULL; len = 0; }
454 
~DeltaNode_tDeltaNode_t455   ~DeltaNode_t ( )
456   { free (seq); } // DeltaGraph_t will take care of destructing the edges
457 };
458 
459 
460 
461 //===================================================== DeltaGraph_t ===========
462 //! \brief A graph of sequences (nodes) and their alignments (edges)
463 //!
464 //!  A bipartite graph with two partite sets, R and Q, where R is the set of
465 //!  reference sequences and Q is the set of query sequences. These nodes are
466 //!  named "DeltaNode_t". We connect a node in R to a node in Q if an alignment
467 //!  is present between the two sequences. The group of all alignments between
468 //!  the two is named "DeltaEdge_t" and a single alignment between the two is
469 //!  named a "DeltaEdgelet_t". Alignment coordinates reference the forward
470 //!  strand and are stored lo before hi.
471 //!
472 //==============================================================================
473 class DeltaGraph_t
474 {
475 public:
476 
477   std::map<std::string, DeltaNode_t> refnodes;
478   //!< the reference graph nodes, 1 per aligned sequence
479 
480   std::map<std::string, DeltaNode_t> qrynodes;
481   //!< the query graph nodes, 1 per aligned sequence
482 
483   std::string refpath;         //!< path of the reference FastA file
484   std::string qrypath;         //!< path of the query FastA file
485   AlignmentType_t datatype;    //!< alignment data type
486 
DeltaGraph_t()487   DeltaGraph_t()
488   { datatype = NULL_DATA; }
489 
~DeltaGraph_t()490   ~DeltaGraph_t ( )
491   { clear(); }
492 
493   void build(const std::string & deltapath, bool getdeltas = true);
494   void clean();
495   void clear();
496   long getNodeCount();
497   long getEdgeCount();
498   long getEdgeletCount();
499 
500   void flagGOOD();
501   void flag1to1(float epsilon = -1, float maxolap = 100.0);
502   void flagMtoM(float epsilon = -1, float maxolap = 100.0);
503   void flagGLIS(float epsilon = -1);
504   void flagQLIS(float epsilon = -1,
505                 float maxolap = 100.0,
506                 bool flagBad = true);
507   void flagRLIS(float epsilon = -1,
508                 float maxolap = 100.0,
509                 bool flagbad = true);
510   void flagScore(long minlen, float minidy);
511   void flagUNIQ(float minuniq);
512 
513   void loadSequences();
514   std::ostream & outputDelta(std::ostream & out);
515 };
516 
517 #endif // #ifndef __DELTA_HH
518