1 /* $Id: restriction.hpp 447585 2014-09-28 23:30:17Z whlavina $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Josh Cherry
27 *
28 * File Description: Classes for representing and finding restriction sites
29 *
30 */
31
32
33 #ifndef ALGO_SEQUENCE___RESTRICTION__HPP
34 #define ALGO_SEQUENCE___RESTRICTION__HPP
35
36 #include <corelib/ncbistd.hpp>
37 #include <corelib/ncbiobj.hpp>
38 #include <objects/seq/Bioseq.hpp>
39 #include <objmgr/bioseq_handle.hpp>
40 #include <algo/sequence/seq_match.hpp>
41 #include <util/strsearch.hpp>
42
43 BEGIN_NCBI_SCOPE
44 USING_SCOPE(objects);
45
46 ///
47 /// This class represents a particular occurrence of a restriction
48 /// site on a sequence (not to be confused with a CRSpec, which
49 /// represents a *type* of restriction site).
50 /// Contains the locations of beginning and end of recognition site,
51 /// and vectors of cut sites on plus and minus strands.
52 ///
53
54 class NCBI_XALGOSEQ_EXPORT CRSite
55 {
56 public:
57 CRSite(int start, int end, ENa_strand strand = eNa_strand_both);
58
59 // location of recognition sequence
60 void SetStart(const int pos);
61 int GetStart(void) const;
62 void SetEnd(const int pos);
63 int GetEnd(void) const;
64 void SetStrand(ENa_strand strand);
65 ENa_strand GetStrand() const;
66
67 // cleavage locations
68 // 0 is the bond just before the recognition sequence
69 vector<int>& SetPlusCuts(void);
70 const vector<int>& GetPlusCuts(void) const;
71
72 vector<int>& SetMinusCuts(void);
73 const vector<int>& GetMinusCuts(void) const;
74 private:
75 int m_Start;
76 int m_End;
77 ENa_strand m_Strand;
78 vector<int> m_PlusCuts;
79 vector<int> m_MinusCuts;
80 };
81
82 NCBI_XALGOSEQ_EXPORT ostream& operator<<(ostream& os, const CRSite& site);
83
84
85 ///////////////////////////////////////////////////////////
86 ///////////////////// inline methods //////////////////////
87 ///////////////////////////////////////////////////////////
88
89 inline
CRSite(int start,int end,ENa_strand strand)90 CRSite::CRSite(int start, int end, ENa_strand strand)
91 : m_Start(start)
92 , m_End(end)
93 , m_Strand(strand)
94 {
95 }
96
97
98 inline
SetPlusCuts(void)99 vector<int>& CRSite::SetPlusCuts(void)
100 {
101 return m_PlusCuts;
102 }
103
104
105 inline
GetPlusCuts(void) const106 const vector<int>& CRSite::GetPlusCuts(void) const
107 {
108 return m_PlusCuts;
109 }
110
111
112 inline
SetMinusCuts(void)113 vector<int>& CRSite::SetMinusCuts(void)
114 {
115 return m_MinusCuts;
116 }
117
118
119 inline
GetMinusCuts(void) const120 const vector<int>& CRSite::GetMinusCuts(void) const
121 {
122 return m_MinusCuts;
123 }
124
125
126 inline
SetStart(int pos)127 void CRSite::SetStart(int pos)
128 {
129 m_Start = pos;
130 }
131
132
133 inline
GetStart(void) const134 int CRSite::GetStart(void) const
135 {
136 return m_Start;
137 }
138
139
140 inline
SetEnd(int pos)141 void CRSite::SetEnd(int pos)
142 {
143 m_End = pos;
144 }
145
146
147 inline
GetEnd(void) const148 int CRSite::GetEnd(void) const
149 {
150 return m_End;
151 }
152
153
154 inline
SetStrand(ENa_strand strand)155 void CRSite::SetStrand(ENa_strand strand)
156 {
157 m_Strand = strand;
158 }
159
160
161 inline
GetStrand(void) const162 ENa_strand CRSite::GetStrand(void) const
163 {
164 return m_Strand;
165 }
166
167
168
169 ///
170 /// This class represents a restriction enzyme specificity,
171 /// i.e., a sequence recognition pattern and vectors of cleavage
172 /// sites on the two strands.
173 /// Some known enzymes (e.g., BaeI) have two cleavage sites on each
174 /// strand. Some will be represented as having zero because the
175 /// cut locations are unknown.
176 /// An enzyme may have more than one specificity (TaqII).
177 ///
178
179 class NCBI_XALGOSEQ_EXPORT CRSpec
180 {
181 public:
182 // recognition sequence
183 void SetSeq(const string& s);
184 string& SetSeq(void);
185 const string& GetSeq(void) const;
186
187 // cleavage locations
188 // 0 is the bond just before the recognition sequence
189 vector<int>& SetPlusCuts(void);
190 const vector<int>& GetPlusCuts(void) const;
191 vector<int>& SetMinusCuts(void);
192 const vector<int>& GetMinusCuts(void) const;
193
194 // compare
operator ==(const CRSpec & rhs) const195 bool operator==(const CRSpec& rhs) const {
196 return m_Seq == rhs.m_Seq
197 && m_PlusCuts == rhs.m_PlusCuts
198 && m_MinusCuts == rhs.m_MinusCuts;
199 }
operator !=(const CRSpec & rhs) const200 bool operator!=(const CRSpec& rhs) const {
201 return !(*this == rhs);
202 }
203 bool operator<(const CRSpec& rhs) const;
204
205 // reset everything
206 void Reset(void);
207 private:
208 string m_Seq;
209 vector<int> m_PlusCuts;
210 vector<int> m_MinusCuts;
211 };
212
213
214 ///////////////////////////////////////////////////////////
215 ///////////////////// inline methods //////////////////////
216 ///////////////////////////////////////////////////////////
217
218
219 inline
SetSeq(const string & s)220 void CRSpec::SetSeq(const string& s)
221 {
222 m_Seq = s;
223 }
224
225
226 inline
SetSeq(void)227 string& CRSpec::SetSeq(void)
228 {
229 return m_Seq;
230 }
231
232
233 inline
GetSeq(void) const234 const string& CRSpec::GetSeq(void) const
235 {
236 return m_Seq;
237 }
238
239
240 inline
SetPlusCuts(void)241 vector<int>& CRSpec::SetPlusCuts(void)
242 {
243 return m_PlusCuts;
244 }
245
246
247 inline
GetPlusCuts(void) const248 const vector<int>& CRSpec::GetPlusCuts(void) const
249 {
250 return m_PlusCuts;
251 }
252
253
254 inline
SetMinusCuts(void)255 vector<int>& CRSpec::SetMinusCuts(void)
256 {
257 return m_MinusCuts;
258 }
259
260
261 inline
GetMinusCuts(void) const262 const vector<int>& CRSpec::GetMinusCuts(void) const
263 {
264 return m_MinusCuts;
265 }
266
267
268 ///
269 /// This class represents a restriction enzyme
270 /// (an enzyme name and a vector of cleavage specificities)
271 ///
272
273 class NCBI_XALGOSEQ_EXPORT CREnzyme
274 {
275 public:
276 typedef vector<CREnzyme> TEnzymes;
277
278 // name of enzyme
279 void SetName(const string& s);
280 string& SetName(void);
281 const string& GetName(void) const;
282
283 // cleavage specificities
284 // (usually just one, but TaqII has two)
285 vector<CRSpec>& SetSpecs(void);
286 const vector<CRSpec>& GetSpecs(void) const;
287
288 void SetPrototype(const string& s = kEmptyStr);
289 const string& GetPrototype() const;
290 bool IsPrototype() const;
291
292 vector<string>& SetIsoschizomers(void);
293 const vector<string>& GetIsoschizomers(void) const;
294
295 // reset everything
296 void Reset(void);
297
298 // Given a vector of CREnzyme, lump together all
299 // enzymes with identical specificities.
300 // The cleavage sites must be the same for specificities
301 // to be considered indentical (in addition to the
302 // recognition sequenence).
303 static void CombineIsoschizomers(TEnzymes& enzymes);
304
305 private:
306 string m_Name;
307 vector<string> m_Isoschizomers;
308
309 /// Prototype name, or none if this enzyme is a prototype.
310 string m_Prototype;
311 vector<CRSpec> m_Specs;
312 };
313
314
315 ///////////////////////////////////////////////////////////
316 ///////////////////// inline methods //////////////////////
317 ///////////////////////////////////////////////////////////
318
319 inline
SetName(const string & s)320 void CREnzyme::SetName(const string& s)
321 {
322 m_Name = s;
323 }
324
325
326 inline
SetName(void)327 string& CREnzyme::SetName(void)
328 {
329 return m_Name;
330 }
331
332
333 inline
GetName(void) const334 const string& CREnzyme::GetName(void) const
335 {
336 return m_Name;
337 }
338
339
340 inline
SetSpecs(void)341 vector<CRSpec>& CREnzyme::SetSpecs(void)
342 {
343 return m_Specs;
344 }
345
346
347 inline
GetSpecs(void) const348 const vector<CRSpec>& CREnzyme::GetSpecs(void) const
349 {
350 return m_Specs;
351 }
352
353
354 inline
GetPrototype(void) const355 const string& CREnzyme::GetPrototype(void) const
356 {
357 return m_Prototype;
358 }
359
360
361 inline
SetPrototype(const string & s)362 void CREnzyme::SetPrototype(const string& s)
363 {
364 m_Prototype = s;
365 }
366
367
368 inline
IsPrototype(void) const369 bool CREnzyme::IsPrototype(void) const
370 {
371 return m_Prototype.empty();
372 }
373
374
375 inline
SetIsoschizomers(void)376 vector<string>& CREnzyme::SetIsoschizomers(void)
377 {
378 return m_Isoschizomers;
379 }
380
381
382 inline
GetIsoschizomers(void) const383 const vector<string>& CREnzyme::GetIsoschizomers(void) const
384 {
385 return m_Isoschizomers;
386 }
387
388
389 ///
390 /// This class provides utilities for dealing with
391 /// REBASE format restriction enzyme databases.
392 ///
393
394 class NCBI_XALGOSEQ_EXPORT CRebase
395 {
396 public:
397 typedef CREnzyme::TEnzymes TEnzymes;
398
399 // build a CRSpec based on a string from REBASE
400 static CRSpec MakeRSpec(const string& site);
401
402 // build a CREnzyme based on two strings from REBASE
403 static CREnzyme MakeREnzyme(const string& name, const string& sites);
404
405 // whether to read all enzymes, only those commercially
406 // available, or the prototype for every recognition site
407 enum EEnzymesToLoad {
408 eAll,
409 eCommercial,
410 ePrototype
411 };
412 // read a REBASE file in "NAR" format
413 static void ReadNARFormat(istream& input,
414 TEnzymes& enzymes,
415 enum EEnzymesToLoad which);
416
417 static string GetDefaultDataPath();
418
419 private:
420 static void x_ParseCutPair(const string& s, int& plus_cut, int& minus_cut);
421 };
422
423
424 ///
425 /// This class represents the results of a search for sites
426 /// of a particular enzyme.
427 /// It merely packages an enzyme name, a vector of
428 /// definite sites, and a vector of possible sites
429 ///
430
431 class CREnzResult : public CObject
432 {
433 public:
CREnzResult(const string & enzyme_name)434 CREnzResult(const string& enzyme_name) : m_EnzymeName(enzyme_name) {}
435 CREnzResult(const string& enzyme_name,
436 const vector<CRSite>& definite_sites,
437 const vector<CRSite>& possible_sites);
438 // member access functions
GetEnzymeName(void) const439 const string& GetEnzymeName(void) const {return m_EnzymeName;}
SetDefiniteSites(void)440 vector<CRSite>& SetDefiniteSites(void) {return m_DefiniteSites;}
GetDefiniteSites(void) const441 const vector<CRSite>& GetDefiniteSites(void) const
442 {
443 return m_DefiniteSites;
444 }
SetPossibleSites(void)445 vector<CRSite>& SetPossibleSites(void) {return m_PossibleSites;}
GetPossibleSites(void) const446 const vector<CRSite>& GetPossibleSites(void) const
447 {
448 return m_PossibleSites;
449 }
450
451 private:
452 string m_EnzymeName;
453 vector<CRSite> m_DefiniteSites;
454 vector<CRSite> m_PossibleSites;
455 };
456
457 NCBI_XALGOSEQ_EXPORT ostream& operator<<(ostream& os, const CREnzResult& er);
458
459
460 ///////////////////////////////////////////////////////////
461 ///////////////////// inline methods //////////////////////
462 ///////////////////////////////////////////////////////////
463
464 inline
CREnzResult(const string & enzyme_name,const vector<CRSite> & definite_sites,const vector<CRSite> & possible_sites)465 CREnzResult::CREnzResult(const string& enzyme_name,
466 const vector<CRSite>& definite_sites,
467 const vector<CRSite>& possible_sites)
468 {
469 m_EnzymeName = enzyme_name;
470 m_DefiniteSites = definite_sites;
471 m_PossibleSites = possible_sites;
472 }
473
474
475 /// this class contains the static member functions Find,
476 /// which find restriction sites in a sequence
477 class NCBI_XALGOSEQ_EXPORT CFindRSites
478 {
479 public:
480 enum EFlags {
481 /// Lump together all enzymes with identical specificities.
482 fFindIsoschizomers = (1 << 1),
483 fCombineIsoschizomers = (1 << 2),
484 fSortByNumberOfSites = (1 << 3),
485 fSplitAnnotsByEnzyme = (1 << 4),
486 fFindPossibleSites = (1 << 5),
487 fDefault = fCombineIsoschizomers | fSortByNumberOfSites
488 };
489 typedef unsigned int TFlags;
490
491 typedef CREnzyme::TEnzymes TEnzymes;
492 typedef CBioseq::TAnnot TAnnot;
493
494 CFindRSites(const string& refile = kEmptyStr,
495 CRebase::EEnzymesToLoad which_enzymes = CRebase::eAll,
496 TFlags flags = fDefault);
497
498 const TEnzymes& GetEnzymes();
499 TEnzymes& SetEnzymes();
500
501 TAnnot GetAnnot(CScope& scope, const CSeq_loc& loc) const;
502 TAnnot GetAnnot(CBioseq_Handle bsh) const;
503
504 static void Find(const string& seq,
505 const TEnzymes& enzymes,
506 vector<CRef<CREnzResult> >& results,
507 TFlags flags = 0);
508 static void Find(const vector<char>& seq,
509 const TEnzymes& enzymes,
510 vector<CRef<CREnzResult> >& results,
511 TFlags flags = 0);
512 static void Find(const CSeqVector& seq,
513 const TEnzymes& enzymes,
514 vector<CRef<CREnzResult> >& results,
515 TFlags flags = 0);
516
517 private:
518 void x_LoadREnzymeData(const string& refile,
519 CRebase::EEnzymesToLoad which_enzymes);
520
521 static void x_ExpandRecursion(string& s, unsigned int pos,
522 CTextFsm<int>& fsm, int match_value);
523 static void x_AddPattern(const string& pat, CTextFsm<int>& fsm,
524 int match_value);
525 static bool x_IsAmbig(char nuc);
526
527 template<class Seq>
528 friend void x_FindRSite(const Seq& seq, const TEnzymes& enzymes,
529 vector<CRef<CREnzResult> >& results,
530 CFindRSites::TFlags);
531
532 TFlags m_Flags;
533 TEnzymes m_Enzymes;
534 };
535
536
537
538 END_NCBI_SCOPE
539
540 #endif // ALGO_SEQUENCE___RESTRICTION__HPP
541