1 /*  $Id: restriction.hpp 447585 2014-09-28 23:30:17Z whlavina $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Josh Cherry
27  *
28  * File Description:  Classes for representing and finding restriction sites
29  *
30  */
31 
32 
33 #ifndef ALGO_SEQUENCE___RESTRICTION__HPP
34 #define ALGO_SEQUENCE___RESTRICTION__HPP
35 
36 #include <corelib/ncbistd.hpp>
37 #include <corelib/ncbiobj.hpp>
38 #include <objects/seq/Bioseq.hpp>
39 #include <objmgr/bioseq_handle.hpp>
40 #include <algo/sequence/seq_match.hpp>
41 #include <util/strsearch.hpp>
42 
43 BEGIN_NCBI_SCOPE
44 USING_SCOPE(objects);
45 
46 ///
47 /// This class represents a particular occurrence of a restriction
48 /// site on a sequence (not to be confused with a CRSpec, which
49 /// represents a *type* of restriction site).
50 /// Contains the locations of beginning and end of recognition site,
51 /// and vectors of cut sites on plus and minus strands.
52 ///
53 
54 class NCBI_XALGOSEQ_EXPORT CRSite
55 {
56 public:
57     CRSite(int start, int end, ENa_strand strand = eNa_strand_both);
58 
59     // location of recognition sequence
60     void SetStart(const int pos);
61     int GetStart(void) const;
62     void SetEnd(const int pos);
63     int GetEnd(void) const;
64     void SetStrand(ENa_strand strand);
65     ENa_strand GetStrand() const;
66 
67     // cleavage locations
68     // 0 is the bond just before the recognition sequence
69     vector<int>&       SetPlusCuts(void);
70     const vector<int>& GetPlusCuts(void) const;
71 
72     vector<int>&       SetMinusCuts(void);
73     const vector<int>& GetMinusCuts(void) const;
74 private:
75     int m_Start;
76     int m_End;
77     ENa_strand m_Strand;
78     vector<int> m_PlusCuts;
79     vector<int> m_MinusCuts;
80 };
81 
82 NCBI_XALGOSEQ_EXPORT ostream& operator<<(ostream& os, const CRSite& site);
83 
84 
85 ///////////////////////////////////////////////////////////
86 ///////////////////// inline methods //////////////////////
87 ///////////////////////////////////////////////////////////
88 
89 inline
CRSite(int start,int end,ENa_strand strand)90 CRSite::CRSite(int start, int end, ENa_strand strand)
91     : m_Start(start)
92     , m_End(end)
93     , m_Strand(strand)
94 {
95 }
96 
97 
98 inline
SetPlusCuts(void)99 vector<int>& CRSite::SetPlusCuts(void)
100 {
101     return m_PlusCuts;
102 }
103 
104 
105 inline
GetPlusCuts(void) const106 const vector<int>& CRSite::GetPlusCuts(void) const
107 {
108     return m_PlusCuts;
109 }
110 
111 
112 inline
SetMinusCuts(void)113 vector<int>& CRSite::SetMinusCuts(void)
114 {
115     return m_MinusCuts;
116 }
117 
118 
119 inline
GetMinusCuts(void) const120 const vector<int>& CRSite::GetMinusCuts(void) const
121 {
122     return m_MinusCuts;
123 }
124 
125 
126 inline
SetStart(int pos)127 void CRSite::SetStart(int pos)
128 {
129     m_Start = pos;
130 }
131 
132 
133 inline
GetStart(void) const134 int CRSite::GetStart(void) const
135 {
136     return m_Start;
137 }
138 
139 
140 inline
SetEnd(int pos)141 void CRSite::SetEnd(int pos)
142 {
143     m_End = pos;
144 }
145 
146 
147 inline
GetEnd(void) const148 int CRSite::GetEnd(void) const
149 {
150     return m_End;
151 }
152 
153 
154 inline
SetStrand(ENa_strand strand)155 void CRSite::SetStrand(ENa_strand strand)
156 {
157     m_Strand = strand;
158 }
159 
160 
161 inline
GetStrand(void) const162 ENa_strand CRSite::GetStrand(void) const
163 {
164     return m_Strand;
165 }
166 
167 
168 
169 ///
170 /// This class represents a restriction enzyme specificity,
171 /// i.e., a sequence recognition pattern and vectors of cleavage
172 /// sites on the two strands.
173 /// Some known enzymes (e.g., BaeI) have two cleavage sites on each
174 /// strand.  Some will be represented as having zero because the
175 /// cut locations are unknown.
176 /// An enzyme may have more than one specificity (TaqII).
177 ///
178 
179 class NCBI_XALGOSEQ_EXPORT CRSpec
180 {
181 public:
182     // recognition sequence
183     void SetSeq(const string& s);
184     string& SetSeq(void);
185     const string& GetSeq(void) const;
186 
187     // cleavage locations
188     // 0 is the bond just before the recognition sequence
189     vector<int>&       SetPlusCuts(void);
190     const vector<int>& GetPlusCuts(void) const;
191     vector<int>&       SetMinusCuts(void);
192     const vector<int>& GetMinusCuts(void) const;
193 
194     // compare
operator ==(const CRSpec & rhs) const195     bool operator==(const CRSpec& rhs) const {
196         return m_Seq == rhs.m_Seq
197             && m_PlusCuts == rhs.m_PlusCuts
198             && m_MinusCuts == rhs.m_MinusCuts;
199     }
operator !=(const CRSpec & rhs) const200     bool operator!=(const CRSpec& rhs) const {
201         return !(*this == rhs);
202     }
203     bool operator<(const CRSpec& rhs) const;
204 
205     // reset everything
206     void Reset(void);
207 private:
208     string m_Seq;
209     vector<int> m_PlusCuts;
210     vector<int> m_MinusCuts;
211 };
212 
213 
214 ///////////////////////////////////////////////////////////
215 ///////////////////// inline methods //////////////////////
216 ///////////////////////////////////////////////////////////
217 
218 
219 inline
SetSeq(const string & s)220 void CRSpec::SetSeq(const string& s)
221 {
222     m_Seq = s;
223 }
224 
225 
226 inline
SetSeq(void)227 string& CRSpec::SetSeq(void)
228 {
229     return m_Seq;
230 }
231 
232 
233 inline
GetSeq(void) const234 const string& CRSpec::GetSeq(void) const
235 {
236     return m_Seq;
237 }
238 
239 
240 inline
SetPlusCuts(void)241 vector<int>& CRSpec::SetPlusCuts(void)
242 {
243     return m_PlusCuts;
244 }
245 
246 
247 inline
GetPlusCuts(void) const248 const vector<int>& CRSpec::GetPlusCuts(void) const
249 {
250     return m_PlusCuts;
251 }
252 
253 
254 inline
SetMinusCuts(void)255 vector<int>& CRSpec::SetMinusCuts(void)
256 {
257     return m_MinusCuts;
258 }
259 
260 
261 inline
GetMinusCuts(void) const262 const vector<int>& CRSpec::GetMinusCuts(void) const
263 {
264     return m_MinusCuts;
265 }
266 
267 
268 ///
269 /// This class represents a restriction enzyme
270 /// (an enzyme name and a vector of cleavage specificities)
271 ///
272 
273 class NCBI_XALGOSEQ_EXPORT CREnzyme
274 {
275 public:
276     typedef vector<CREnzyme> TEnzymes;
277 
278     // name of enzyme
279     void SetName(const string& s);
280     string& SetName(void);
281     const string& GetName(void) const;
282 
283     // cleavage specificities
284     // (usually just one, but TaqII has two)
285     vector<CRSpec>& SetSpecs(void);
286     const vector<CRSpec>& GetSpecs(void) const;
287 
288     void SetPrototype(const string& s = kEmptyStr);
289     const string& GetPrototype() const;
290     bool IsPrototype() const;
291 
292     vector<string>& SetIsoschizomers(void);
293     const vector<string>& GetIsoschizomers(void) const;
294 
295     // reset everything
296     void Reset(void);
297 
298     // Given a vector of CREnzyme, lump together all
299     // enzymes with identical specificities.
300     // The cleavage sites must be the same for specificities
301     // to be considered indentical (in addition to the
302     // recognition sequenence).
303     static void CombineIsoschizomers(TEnzymes& enzymes);
304 
305 private:
306     string m_Name;
307     vector<string> m_Isoschizomers;
308 
309     /// Prototype name, or none if this enzyme is a prototype.
310     string m_Prototype;
311     vector<CRSpec> m_Specs;
312 };
313 
314 
315 ///////////////////////////////////////////////////////////
316 ///////////////////// inline methods //////////////////////
317 ///////////////////////////////////////////////////////////
318 
319 inline
SetName(const string & s)320 void CREnzyme::SetName(const string& s)
321 {
322     m_Name = s;
323 }
324 
325 
326 inline
SetName(void)327 string& CREnzyme::SetName(void)
328 {
329     return m_Name;
330 }
331 
332 
333 inline
GetName(void) const334 const string& CREnzyme::GetName(void) const
335 {
336     return m_Name;
337 }
338 
339 
340 inline
SetSpecs(void)341 vector<CRSpec>& CREnzyme::SetSpecs(void)
342 {
343     return m_Specs;
344 }
345 
346 
347 inline
GetSpecs(void) const348 const vector<CRSpec>& CREnzyme::GetSpecs(void) const
349 {
350     return m_Specs;
351 }
352 
353 
354 inline
GetPrototype(void) const355 const string& CREnzyme::GetPrototype(void) const
356 {
357     return m_Prototype;
358 }
359 
360 
361 inline
SetPrototype(const string & s)362 void CREnzyme::SetPrototype(const string& s)
363 {
364     m_Prototype = s;
365 }
366 
367 
368 inline
IsPrototype(void) const369 bool CREnzyme::IsPrototype(void) const
370 {
371     return m_Prototype.empty();
372 }
373 
374 
375 inline
SetIsoschizomers(void)376 vector<string>& CREnzyme::SetIsoschizomers(void)
377 {
378     return m_Isoschizomers;
379 }
380 
381 
382 inline
GetIsoschizomers(void) const383 const vector<string>& CREnzyme::GetIsoschizomers(void) const
384 {
385     return m_Isoschizomers;
386 }
387 
388 
389 ///
390 /// This class provides utilities for dealing with
391 /// REBASE format restriction enzyme databases.
392 ///
393 
394 class NCBI_XALGOSEQ_EXPORT CRebase
395 {
396 public:
397     typedef CREnzyme::TEnzymes TEnzymes;
398 
399     // build a CRSpec based on a string from REBASE
400     static CRSpec MakeRSpec(const string& site);
401 
402     // build a CREnzyme based on two strings from REBASE
403     static CREnzyme MakeREnzyme(const string& name, const string& sites);
404 
405     // whether to read all enzymes, only those commercially
406     // available, or the prototype for every recognition site
407     enum EEnzymesToLoad {
408         eAll,
409         eCommercial,
410         ePrototype
411     };
412     // read a REBASE file in "NAR" format
413     static void ReadNARFormat(istream& input,
414                               TEnzymes& enzymes,
415                               enum EEnzymesToLoad which);
416 
417     static string GetDefaultDataPath();
418 
419 private:
420     static void x_ParseCutPair(const string& s, int& plus_cut, int& minus_cut);
421 };
422 
423 
424 ///
425 /// This class represents the results of a search for sites
426 /// of a particular enzyme.
427 /// It merely packages an enzyme name, a vector of
428 /// definite sites, and a vector of possible sites
429 ///
430 
431 class CREnzResult : public CObject
432 {
433 public:
CREnzResult(const string & enzyme_name)434     CREnzResult(const string& enzyme_name) : m_EnzymeName(enzyme_name) {}
435     CREnzResult(const string& enzyme_name,
436                 const vector<CRSite>& definite_sites,
437                 const vector<CRSite>& possible_sites);
438     // member access functions
GetEnzymeName(void) const439     const string& GetEnzymeName(void) const {return m_EnzymeName;}
SetDefiniteSites(void)440     vector<CRSite>& SetDefiniteSites(void) {return m_DefiniteSites;}
GetDefiniteSites(void) const441     const vector<CRSite>& GetDefiniteSites(void) const
442     {
443         return m_DefiniteSites;
444     }
SetPossibleSites(void)445     vector<CRSite>& SetPossibleSites(void) {return m_PossibleSites;}
GetPossibleSites(void) const446     const vector<CRSite>& GetPossibleSites(void) const
447     {
448         return m_PossibleSites;
449     }
450 
451 private:
452     string m_EnzymeName;
453     vector<CRSite> m_DefiniteSites;
454     vector<CRSite> m_PossibleSites;
455 };
456 
457 NCBI_XALGOSEQ_EXPORT ostream& operator<<(ostream& os, const CREnzResult& er);
458 
459 
460 ///////////////////////////////////////////////////////////
461 ///////////////////// inline methods //////////////////////
462 ///////////////////////////////////////////////////////////
463 
464 inline
CREnzResult(const string & enzyme_name,const vector<CRSite> & definite_sites,const vector<CRSite> & possible_sites)465 CREnzResult::CREnzResult(const string& enzyme_name,
466                          const vector<CRSite>& definite_sites,
467                          const vector<CRSite>& possible_sites)
468 {
469     m_EnzymeName = enzyme_name;
470     m_DefiniteSites = definite_sites;
471     m_PossibleSites = possible_sites;
472 }
473 
474 
475 /// this class contains the static member functions Find,
476 /// which find restriction sites in a sequence
477 class NCBI_XALGOSEQ_EXPORT CFindRSites
478 {
479 public:
480     enum EFlags {
481         /// Lump together all enzymes with identical specificities.
482         fFindIsoschizomers    = (1 << 1),
483         fCombineIsoschizomers = (1 << 2),
484         fSortByNumberOfSites  = (1 << 3),
485         fSplitAnnotsByEnzyme  = (1 << 4),
486         fFindPossibleSites    = (1 << 5),
487         fDefault = fCombineIsoschizomers | fSortByNumberOfSites
488     };
489     typedef unsigned int TFlags;
490 
491     typedef CREnzyme::TEnzymes TEnzymes;
492     typedef CBioseq::TAnnot TAnnot;
493 
494     CFindRSites(const string& refile = kEmptyStr,
495                 CRebase::EEnzymesToLoad which_enzymes = CRebase::eAll,
496                 TFlags flags = fDefault);
497 
498     const TEnzymes& GetEnzymes();
499     TEnzymes& SetEnzymes();
500 
501     TAnnot GetAnnot(CScope& scope, const CSeq_loc& loc) const;
502     TAnnot GetAnnot(CBioseq_Handle bsh) const;
503 
504     static void Find(const string& seq,
505                      const TEnzymes& enzymes,
506                      vector<CRef<CREnzResult> >& results,
507                      TFlags flags = 0);
508     static void Find(const vector<char>& seq,
509                      const TEnzymes& enzymes,
510                      vector<CRef<CREnzResult> >& results,
511                      TFlags flags = 0);
512     static void Find(const CSeqVector& seq,
513                      const TEnzymes& enzymes,
514                      vector<CRef<CREnzResult> >& results,
515                      TFlags flags = 0);
516 
517 private:
518     void x_LoadREnzymeData(const string& refile,
519                            CRebase::EEnzymesToLoad which_enzymes);
520 
521     static void x_ExpandRecursion(string& s, unsigned int pos,
522                                   CTextFsm<int>& fsm, int match_value);
523     static void x_AddPattern(const string& pat, CTextFsm<int>& fsm,
524                              int match_value);
525     static bool x_IsAmbig(char nuc);
526 
527     template<class Seq>
528     friend void x_FindRSite(const Seq& seq, const TEnzymes& enzymes,
529                             vector<CRef<CREnzResult> >& results,
530                             CFindRSites::TFlags);
531 
532     TFlags m_Flags;
533     TEnzymes m_Enzymes;
534 };
535 
536 
537 
538 END_NCBI_SCOPE
539 
540 #endif   // ALGO_SEQUENCE___RESTRICTION__HPP
541