1 /* $Id: restriction.cpp 623045 2021-01-07 14:09:35Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Josh Cherry
27 *
28 * File Description: Classes for representing and finding restriction sites
29 *
30 */
31
32
33 #include <ncbi_pch.hpp>
34 #include <algorithm>
35 #include <corelib/ncbistd.hpp>
36
37 #include <objects/general/Dbtag.hpp>
38 #include <objects/general/Object_id.hpp>
39 #include <objects/seqfeat/Rsite_ref.hpp>
40 #include <objects/seqloc/Seq_point.hpp>
41 #include <objects/seq/Annot_descr.hpp>
42 #include <objects/seq/Annotdesc.hpp>
43
44 #include <objmgr/seq_vector.hpp>
45 #include <objmgr/util/sequence.hpp>
46
47 #include <algo/sequence/restriction.hpp>
48
49
50 BEGIN_NCBI_SCOPE
51 USING_SCOPE(objects);
52
53 NCBI_PARAM_DECL(string, RESTRICTION_SITES, REBASE);
54 NCBI_PARAM_DEF (string, RESTRICTION_SITES, REBASE, "");
55 typedef NCBI_PARAM_TYPE(RESTRICTION_SITES, REBASE) TRebaseData;
56
57
operator <(const CRSpec & rhs) const58 bool CRSpec::operator<(const CRSpec& rhs) const
59 {
60 if (GetSeq() != rhs.GetSeq()) {
61 return GetSeq() < rhs.GetSeq();
62 }
63 // otherwise sequences identical
64 if (GetPlusCuts() != rhs.GetPlusCuts()) {
65 return GetPlusCuts() < rhs.GetPlusCuts();
66 }
67 if (GetMinusCuts() != rhs.GetMinusCuts()) {
68 return GetMinusCuts() < rhs.GetMinusCuts();
69 }
70 // otherwise arguments are equal, and < is false
71 return false;
72 }
73
74
Reset(void)75 void CREnzyme::Reset(void)
76 {
77 m_Name.erase();
78 m_Specs.clear();
79 }
80
81
82 // helper functor for sorting enzymes by specificity
83 struct SCompareSpecs
84 {
operator ()SCompareSpecs85 bool operator()(const CREnzyme& lhs, const CREnzyme& rhs)
86 {
87 return lhs.GetSpecs() < rhs.GetSpecs();
88 }
89 };
90
91
CombineIsoschizomers(TEnzymes & enzymes)92 void CREnzyme::CombineIsoschizomers(TEnzymes& enzymes)
93 {
94 stable_sort(enzymes.begin(), enzymes.end(), SCompareSpecs());
95 TEnzymes result;
96 ITERATE (TEnzymes, enzyme, enzymes) {
97 if (enzyme != enzymes.begin() &&
98 enzyme->GetSpecs() == result.back().GetSpecs()) {
99 result.back().SetName() += "/";
100 result.back().SetName() += enzyme->GetName();
101 } else {
102 result.push_back(*enzyme);
103 result.back().SetPrototype();
104 }
105 }
106 swap(enzymes, result);
107 }
108
109
Reset(void)110 void CRSpec::Reset(void)
111 {
112 m_Seq.erase();
113 m_PlusCuts.clear();
114 m_MinusCuts.clear();
115 }
116
117
operator <<(ostream & os,const CRSite & site)118 ostream& operator<<(ostream& os, const CRSite& site)
119 {
120 os << "Recog. site: " << site.GetStart() << '-'
121 << site.GetEnd() << endl;
122 os << "Plus strand cuts: ";
123 string s;
124 ITERATE (vector<int>, cut, site.GetPlusCuts()) {
125 if (!s.empty()) {
126 s += " ,";
127 }
128 s += NStr::IntToString(*cut);
129 }
130 os << s << endl;
131
132 os << "Minus strand cuts: ";
133 s.erase();
134 ITERATE (vector<int>, cut, site.GetMinusCuts()) {
135 if (!s.empty()) {
136 s += " ,";
137 }
138 s += NStr::IntToString(*cut);
139 }
140 os << s << endl;
141
142 return os;
143 }
144
145
MakeRSpec(const string & site)146 CRSpec CRebase::MakeRSpec(const string& site)
147 {
148 // site contains a string such as
149 // GACGTC, GACGT^C, GCAGC(8/12), or (8/13)GAGNNNNNCTC(13/8)
150
151 CRSpec spec;
152
153 int plus_cut, minus_cut;
154
155 string s = site;
156 if (s[0] == '(') {
157 string::size_type idx = s.find_first_of(")");
158 if (idx == std::string::npos) {
159 throw runtime_error(string("Error parsing site ")
160 + site);
161 }
162 x_ParseCutPair(s.substr(0, idx + 1), plus_cut, minus_cut);
163 s.erase(0, idx + 1);
164 spec.SetPlusCuts().push_back(-plus_cut);
165 spec.SetMinusCuts().push_back(-minus_cut);
166 }
167 if (s[s.length() - 1] == ')') {
168 string::size_type idx = s.find_last_of("(");
169 if (idx == std::string::npos) {
170 throw runtime_error(string("Error parsing site ")
171 + site);
172 }
173 x_ParseCutPair(s.substr(idx), plus_cut, minus_cut);
174 s.erase(idx);
175 spec.SetPlusCuts().push_back(plus_cut + s.length());
176 spec.SetMinusCuts().push_back(minus_cut + s.length());
177 }
178 for (unsigned int i = 0; i < s.length(); i++) {
179 if (s[i] == '^') {
180
181 // This should be simple. If we have
182 // G^GATCC, the cut on the plus strand is
183 // at 1, and on the minus it's at the
184 // symmetric position, 5.
185 // The complication is that TspRI
186 // is given as CASTGNN^, not NNCASTGNN^.
187 // Someone someday might write NNCASTGNN^,
188 // and something like ^NNGATC might arise,
189 // so code defensively.
190
191 s.erase(i, 1);
192 int plus_cut = i;
193
194 // this is the slightly complicated bit
195 // trim any leading and trailing N's;
196 // in case leading N's removed, adjust plus_cut accordingly
197 string::size_type idx = s.find_first_not_of("N");
198 if (idx == string::npos) {
199 // bizarre situation; all N's (possibly empty)
200 s.erase();
201 plus_cut = 0;
202 } else {
203 s.erase(0, idx);
204 plus_cut -= idx;
205 }
206 idx = s.find_last_not_of("N");
207 s.erase(idx + 1);
208
209 // plus strand cut
210 spec.SetPlusCuts().push_back(plus_cut);
211 // symmetric cut on minus strand
212 spec.SetMinusCuts().push_back(s.length() - plus_cut);
213 break; // there better be just one '^'
214 }
215 }
216 spec.SetSeq(s);
217 return spec;
218 }
219
220
MakeREnzyme(const string & name,const string & sites)221 CREnzyme CRebase::MakeREnzyme(const string& name, const string& sites)
222 {
223
224 CREnzyme re;
225
226 re.SetName(name);
227
228 vector<string> site_vec;
229 NStr::Split(sites, ",", site_vec);
230 ITERATE (vector<string>, iter, site_vec) {
231 CRSpec spec = CRebase::MakeRSpec(*iter);
232 re.SetSpecs().push_back(spec);
233 }
234
235 return re;
236 }
237
238
GetDefaultDataPath()239 string CRebase::GetDefaultDataPath()
240 {
241 return TRebaseData::GetDefault();
242 }
243
244
x_ParseCutPair(const string & s,int & plus_cut,int & minus_cut)245 void CRebase::x_ParseCutPair(const string& s, int& plus_cut, int& minus_cut)
246 {
247 // s should look like "(8/13)"; plus_cut gets set to 8 and minus_cut to 13
248 list<string> l;
249 NStr::Split(s.substr(1, s.length() - 2), "/", l, NStr::fSplit_Tokenize);
250 if (l.size() != 2) {
251 throw runtime_error(string("Couldn't parse cut locations ")
252 + s);
253 }
254 plus_cut = NStr::StringToInt(l.front());
255 minus_cut = NStr::StringToInt(l.back());
256 }
257
258
ReadNARFormat(istream & input,TEnzymes & enzymes,enum EEnzymesToLoad which)259 void CRebase::ReadNARFormat(istream& input,
260 TEnzymes& enzymes,
261 enum EEnzymesToLoad which)
262 {
263 string line;
264 CREnzyme enzyme;
265 string name;
266 TEnzymes::size_type prototype_idx(0);
267
268 while (getline(input, line)) {
269 vector<string> fields;
270 NStr::Split(line, "\t", fields);
271 // the lines we're interested in have more than one field
272 if (fields.size() < 2) {
273 continue;
274 }
275 // name of enzyme is in field one or two (field zero is empty)
276 name = fields[1];
277 bool is_prototype(true);
278 if (name == " ") {
279 if (which == ePrototype) {
280 continue; // skip this non-protype
281 }
282 is_prototype = false;
283 name = fields[2];
284 }
285 if (which == eCommercial && fields[5].empty()) {
286 continue; // skip this commercially unavailable enzyme
287 }
288 string sites = fields[3]; // this contains a comma-separted list of sites,
289 // usually just one (but TaqII has two)
290 enzyme = MakeREnzyme(name, sites);
291 enzymes.push_back(enzyme);
292
293 // Link isoschizomers with their prototypes.
294 if (is_prototype) {
295 prototype_idx = enzymes.size();
296 } else if (prototype_idx) {
297 CREnzyme& prototype = enzymes[prototype_idx - 1];
298 enzyme.SetPrototype(prototype.GetName());
299 prototype.SetIsoschizomers().push_back(name);
300 }
301 }
302 }
303
304
operator <<(ostream & os,const CREnzResult & er)305 ostream& operator<<(ostream& os, const CREnzResult& er)
306 {
307 os << "Enzyme: " << er.GetEnzymeName() << endl;
308 os << er.GetDefiniteSites().size() << " definite sites:" << endl;
309 ITERATE (vector<CRSite>, site, er.GetDefiniteSites()) {
310 os << *site;
311 }
312 os << er.GetPossibleSites().size() << " possible sites:" << endl;
313 ITERATE (vector<CRSite>, site, er.GetPossibleSites()) {
314 os << *site;
315 }
316 return os;
317 }
318
319
320 struct SCompareLocation
321 {
operator ()SCompareLocation322 bool operator() (const CRSite& lhs, const CRSite& rhs) const
323 {
324 return lhs.GetStart() < rhs.GetStart();
325 }
326 };
327
328
329 // Class for internal use by restriction site finding.
330 // Holds a pattern in ncbi8na, integers that specify
331 // which enzyme (of a sequential collection)
332 // and which specifity of that
333 // enzyme it represents, a field indicating
334 // whether it represents the complement of the specificity,
335 // and the length of the initial subpattern that was
336 // put into the fsm.
337 class CPatternRec
338 {
339 public:
CPatternRec(string pattern,int enzyme_index,int spec_index,ENa_strand strand,unsigned int fsm_pat_size)340 CPatternRec(string pattern, int enzyme_index, int spec_index,
341 ENa_strand strand,
342 unsigned int fsm_pat_size) : m_Pattern(pattern),
343 m_EnzymeIndex(enzyme_index),
344 m_SpecIndex(spec_index),
345 m_Strand(strand),
346 m_FsmPatSize(fsm_pat_size) {}
347 // member access
GetPattern(void) const348 const string& GetPattern(void) const {return m_Pattern;}
GetEnzymeIndex(void) const349 int GetEnzymeIndex(void) const {return m_EnzymeIndex;}
GetSpecIndex(void) const350 int GetSpecIndex(void) const {return m_SpecIndex;}
GetStrand(void) const351 ENa_strand GetStrand(void) const {return m_Strand;}
GetFsmPatSize(void) const352 unsigned int GetFsmPatSize(void) const {return m_FsmPatSize;}
353 private:
354 // pattern in ncbi8na
355 string m_Pattern;
356 // which enzyme and specificity we represent
357 int m_EnzymeIndex;
358 int m_SpecIndex;
359 // whether we represent the complement of the specificity
360 ENa_strand m_Strand;
361 unsigned int m_FsmPatSize;
362 };
363
364
365 // helper functor for sorting CRef<CREnzResult> by the enzyme name
366 struct SEnzymeNameCompare
367 {
operator ()SEnzymeNameCompare368 bool operator()
369 (const CRef<CREnzResult>& lhs, const CRef<CREnzResult>& rhs) const
370 {
371 return lhs->GetEnzymeName() < rhs->GetEnzymeName();
372 }
373 };
374
375
376 // helper functor for sorting CREnzyme by the enzyme name
377 struct SNameCompare
378 {
operator ()SNameCompare379 bool operator()
380 (const CREnzyme& lhs, const CREnzyme& rhs) const
381 {
382 return lhs.GetName() < rhs.GetName();
383 }
384 };
385
386
387 // helper functor for sorting by the number of definite sites
388 struct SLessDefSites
389 {
operator ()SLessDefSites390 bool operator()
391 (const CRef<CREnzResult>& lhs, const CRef<CREnzResult>& rhs) const
392 {
393 return lhs->GetDefiniteSites().size() < rhs->GetDefiniteSites().size();
394 }
395 };
396
397
398 // helper functor for sorting CRef<CSeq_loc>s by location
399 struct SLessSeq_loc
400 {
operator ()SLessSeq_loc401 bool operator()
402 (const CRef<CSeq_loc>& lhs, const CRef<CSeq_loc>& rhs) const
403 {
404 return (lhs->Compare(*rhs, CSeq_loc::fCompare_Default) < 0);
405 }
406 };
407
408
409
410
x_ExpandRecursion(string & s,unsigned int pos,CTextFsm<int> & fsm,int match_value)411 void CFindRSites::x_ExpandRecursion(string& s, unsigned int pos,
412 CTextFsm<int>& fsm, int match_value)
413 {
414 if (pos == s.size()) {
415 // this is the place
416 fsm.AddWord(s, match_value);
417 return;
418 }
419
420 char orig_ch = s[pos];
421 for (char x = 1; x <= 8; x <<= 1) {
422 if (orig_ch & x) {
423 s[pos] = x;
424 x_ExpandRecursion(s, pos + 1, fsm, match_value);
425 }
426 }
427 // restore original character
428 s[pos] = orig_ch;
429 }
430
431
CFindRSites(const string & refile,CRebase::EEnzymesToLoad which_enzymes,TFlags flags)432 CFindRSites::CFindRSites(const string& refile,
433 CRebase::EEnzymesToLoad which_enzymes,
434 TFlags flags)
435 : m_Flags(flags)
436 {
437 x_LoadREnzymeData(refile.empty()
438 ? CRebase::GetDefaultDataPath()
439 : refile,
440 which_enzymes);
441 }
442
443
GetEnzymes()444 const CFindRSites::TEnzymes& CFindRSites::GetEnzymes()
445 {
446 return m_Enzymes;
447 }
448
449
SetEnzymes()450 CFindRSites::TEnzymes& CFindRSites::SetEnzymes()
451 {
452 return m_Enzymes;
453 }
454
455
x_LoadREnzymeData(const string & refile,CRebase::EEnzymesToLoad which_enzymes)456 void CFindRSites::x_LoadREnzymeData(const string& refile,
457 CRebase::EEnzymesToLoad which_enzymes)
458 {
459 if (! refile.empty()) {
460 ifstream istr(refile.c_str());
461 CRebase::ReadNARFormat(istr, m_Enzymes, which_enzymes);
462
463 if (m_Flags & fCombineIsoschizomers) {
464 // first sort alphabetically by enzyme name
465 sort(m_Enzymes.begin(), m_Enzymes.end(), SNameCompare());
466 // now combine isoschizomers
467 CREnzyme::CombineIsoschizomers(m_Enzymes);
468 }
469 }
470 }
471
472
473 // remap a child location to a parent
s_RemapChildToParent(const CSeq_loc & parent,const CSeq_loc & child,CScope * scope)474 static CRef<CSeq_loc> s_RemapChildToParent(const CSeq_loc& parent,
475 const CSeq_loc& child,
476 CScope* scope)
477 {
478 CSeq_loc dummy_parent;
479 dummy_parent.SetWhole(const_cast<CSeq_id&>(sequence::GetId(parent, 0)));
480 SRelLoc converter(dummy_parent, child, scope, SRelLoc::fNoMerge);
481 converter.m_ParentLoc = &parent;
482 return converter.Resolve(scope);
483 }
484
485 static
s_AddSitesToAnnot(const vector<CRSite> & sites,const CREnzResult & result,CSeq_annot & annot,CScope & scope,const CSeq_loc & parent_loc,bool definite=true)486 void s_AddSitesToAnnot(const vector<CRSite>& sites,
487 const CREnzResult& result,
488 CSeq_annot& annot,
489 CScope& scope,
490 const CSeq_loc& parent_loc,
491 bool definite = true)
492 {
493 const CSeq_id& id = sequence::GetId(parent_loc, 0);
494
495 ITERATE (vector<CRSite>, site, sites) {
496 // create feature
497 CRef<CSeq_feat> feat(new CSeq_feat());
498
499 // start to set up Rsite
500 feat->SetData().SetRsite().SetDb().SetDb("REBASE");
501 feat->SetData().SetRsite().SetDb()
502 .SetTag().SetStr("REBASE");
503
504 string str(result.GetEnzymeName());
505 if ( !definite ) {
506 str = "Possible " + str;
507 }
508 feat->SetData().SetRsite().SetStr(str);
509
510 //
511 // build our location
512 //
513
514 vector< CRef<CSeq_loc> > locs;
515
516 // a loc for the recognition site
517 CRef<CSeq_loc> recog_site(new CSeq_loc);
518 recog_site->SetInt().SetFrom(site->GetStart());
519 recog_site->SetInt().SetTo (site->GetEnd());
520 recog_site->SetInt().SetStrand(site->GetStrand());
521 recog_site->SetId(id);
522 locs.push_back(recog_site);
523
524 ENa_strand cut_strand =
525 IsReverse(site->GetStrand()) ? eNa_strand_minus
526 : eNa_strand_plus;
527
528 // locs for the cleavage sites
529 int negative_cut_locs = 0; // count these exceptions
530 ITERATE (vector<int>, cut, site->GetPlusCuts()) {
531 if (*cut >= 0 ) {
532 CRef<CSeq_loc> cut_site(new CSeq_loc);
533 cut_site->SetPnt().SetPoint(*cut);
534 // indicate that the cut is to the "left"
535 cut_site->SetPnt()
536 .SetFuzz().SetLim(CInt_fuzz::eLim_tl);
537 cut_site->SetPnt().SetStrand(cut_strand);
538 cut_site->SetId(id);
539 locs.push_back(cut_site);
540 } else {
541 negative_cut_locs++;
542 }
543 }
544
545 ITERATE (vector<int>, cut, site->GetMinusCuts()) {
546 if (*cut >= 0 ) {
547 CRef<CSeq_loc> cut_site(new CSeq_loc);
548 cut_site->SetPnt().SetPoint(*cut);
549 // indicate that the cut is to the "left"
550 cut_site->SetPnt()
551 .SetFuzz().SetLim(CInt_fuzz::eLim_tl);
552 cut_site->SetPnt().SetStrand(Reverse(cut_strand));
553 cut_site->SetId(id);
554 locs.push_back(cut_site);
555 } else {
556 negative_cut_locs++;
557 }
558 }
559
560 // comment for those few cases where there are
561 // cuts before the sequence begins
562 if (negative_cut_locs > 0) {
563 string a_comm = NStr::IntToString(negative_cut_locs)
564 + " cleavage sites are located before the"
565 " beginning of the sequence and are not reported";
566 feat->SetComment(a_comm);
567 }
568
569 sort(locs.begin(), locs.end(), SLessSeq_loc());
570 copy(locs.begin(), locs.end(),
571 back_inserter(feat->SetLocation().SetMix().Set()));
572
573 CRef<CSeq_loc> mapped =
574 s_RemapChildToParent(parent_loc, feat->GetLocation(), &scope);
575 feat->SetLocation(*mapped);
576
577 // save in annot
578 annot.SetData().SetFtable().push_back(feat);
579 }
580 }
581
582
583 CFindRSites::TAnnot
GetAnnot(CScope & scope,const CSeq_loc & loc) const584 CFindRSites::GetAnnot(CScope& scope, const CSeq_loc& loc) const
585 {
586 CTime now(CTime::eCurrent);
587 // get sequence in binary (8na) form
588 CSeqVector vec(loc, scope, CBioseq_Handle::eCoding_Ncbi);
589
590 // a place to store results (one per enzyme)
591 typedef vector<CRef<CREnzResult> > TResults;
592 TResults results;
593
594 // do the big search
595 Find(vec, m_Enzymes, results, m_Flags);
596
597 // deal with stored enzyme results
598
599 sort(results.begin(), results.end(), SEnzymeNameCompare());
600
601 if (m_Flags & fSortByNumberOfSites) {
602 // do a stablesort so alphabetical order preserved
603 // within sets with the same number of sites
604 stable_sort(results.begin(), results.end(), SLessDefSites());
605 }
606
607 TAnnot annot;
608 int total_definite_sites = 0, total_possible_sites = 0;
609 int total_non_cutters = 0;
610
611 ITERATE (TResults, result, results) {
612 const vector<CRSite>& definite_sites =
613 (*result)->GetDefiniteSites();
614 const vector<CRSite>& possible_sites =
615 (*result)->GetPossibleSites();
616
617 int count_definite_sites = definite_sites.size();
618 int count_possible_sites = possible_sites.size();
619
620 if (count_definite_sites || count_possible_sites) {
621 total_definite_sites += count_definite_sites;
622 total_possible_sites += count_possible_sites;
623
624 if (m_Flags & fSplitAnnotsByEnzyme || annot.empty()) {
625 CRef<CSeq_annot> new_annot(new CSeq_annot);
626 // add description to annot
627 const string title("Restriction sites");
628 new_annot->SetNameDesc(title);
629 new_annot->SetTitleDesc(title);
630 new_annot->SetCreateDate(now);
631 CRef<CAnnotdesc> region(new CAnnotdesc);
632 region->SetRegion().Assign(loc);
633 new_annot->SetDesc().Set().push_back(region);
634 annot.push_back(new_annot);
635 }
636 CSeq_annot& curr_annot(*annot.back());
637
638 //
639 // add features to annot
640 //
641 s_AddSitesToAnnot(definite_sites,
642 **result, curr_annot, scope, loc);
643 s_AddSitesToAnnot(possible_sites,
644 **result, curr_annot, scope, loc, false);
645 } else {
646 ++total_non_cutters;
647 }
648 }
649
650 /**
651 // in order to build dialog efficiently,
652 // pre-allocate rows
653 int total_rows = total_definite_sites + total_possible_sites
654 + total_non_cutters;
655 **/
656
657 _TRACE("Found " << total_definite_sites << " definite and "
658 << total_possible_sites << " possible sites");
659
660 return annot;
661 }
662
663
664 CFindRSites::TAnnot
GetAnnot(CBioseq_Handle bsh) const665 CFindRSites::GetAnnot(CBioseq_Handle bsh) const
666 {
667 CSeq_id_Handle idh = sequence::GetId(bsh, sequence::eGetId_Canonical);
668 CRef<CSeq_loc> loc(new CSeq_loc);
669 loc->SetWhole().Assign(*idh.GetSeqId());
670 return GetAnnot(bsh.GetScope(), *loc);
671 }
672
673
x_AddPattern(const string & pat,CTextFsm<int> & fsm,int match_value)674 void CFindRSites::x_AddPattern(const string& pat, CTextFsm<int>& fsm,
675 int match_value)
676 {
677 string s = pat;
678 x_ExpandRecursion(s, 0, fsm, match_value);
679 }
680
681
x_IsAmbig(char nuc)682 bool CFindRSites::x_IsAmbig (char nuc)
683 {
684 static const bool ambig_table[16] = {
685 0, 0, 0, 1, 0, 1, 1, 1,
686 0, 1, 1, 1, 1, 1, 1, 1
687 };
688 return ambig_table[(size_t) nuc];
689 }
690
691
692 /// Find all definite and possible sites in a sequence
693 /// for a vector of enzymes, using a finite state machine.
694 /// Templatized so that seq can be various containers
695 /// (e.g., string, vector<char>, CSeqVector),
696 /// but it must yield ncbi8na.
697 template<class Seq>
x_FindRSite(const Seq & seq,const CFindRSites::TEnzymes & enzymes,vector<CRef<CREnzResult>> & results,CFindRSites::TFlags flags)698 void x_FindRSite(const Seq& seq, const CFindRSites::TEnzymes& enzymes,
699 vector<CRef<CREnzResult> >& results,
700 CFindRSites::TFlags flags)
701 {
702
703 results.clear();
704 results.reserve(enzymes.size());
705
706 // vector of patterns for internal use
707 vector<CPatternRec> patterns;
708 patterns.reserve(enzymes.size()); // an underestimate
709
710 // the finite state machine for the search
711 CTextFsm<int> fsm;
712
713 // iterate over enzymes
714 ITERATE (CFindRSites::TEnzymes, enzyme, enzymes) {
715 // Unless isoschizomers are requested, skip them.
716 if (! (flags & CFindRSites::fFindIsoschizomers ||
717 enzyme->IsPrototype())) {
718 continue;
719 }
720
721 results.push_back(CRef<CREnzResult>
722 (new CREnzResult(enzyme->GetName())));
723
724 string pat;
725 const vector<CRSpec>& specs = enzyme->GetSpecs();
726
727 // iterate over specificities for this enzyme
728 ITERATE (vector<CRSpec>, spec, specs) {
729 // one or two patterns for this specificity
730 // (one for pallindrome, two otherwise)
731
732 // to avoid combinatorial explosion,
733 // if there are more than two Ns only use
734 // part of pattern before first N
735 CSeqMatch::IupacToNcbi8na(spec->GetSeq(), pat);
736 // Check if the pattern is pallindromic.
737 string comp = pat;
738 CSeqMatch::CompNcbi8na(comp);
739 ENa_strand strand =
740 (comp == pat) ? eNa_strand_both : eNa_strand_plus;
741
742 SIZE_TYPE fsm_pat_size = pat.find_first_of(0x0f);
743 {{
744 SIZE_TYPE pos = pat.find_first_of(0x0f, fsm_pat_size + 1);
745 if (pos == NPOS
746 || pat.find_first_of(0x0f, pos + 1) == NPOS) {
747 fsm_pat_size = pat.size();
748 }
749 }}
750
751 patterns.push_back(CPatternRec(pat, enzyme - enzymes.begin(),
752 spec - specs.begin(),
753 strand,
754 fsm_pat_size));
755 // add pattern to fsm
756 // (add only fsm_pat_size of it)
757 CFindRSites::x_AddPattern(pat.substr(0, fsm_pat_size), fsm,
758 patterns.size() - 1);
759
760 // If not palidromic do a search for its complement too
761 if (strand != eNa_strand_both) {
762 {{
763 fsm_pat_size = comp.find_first_of(0x0f);
764 SIZE_TYPE pos = comp.find_first_of(0x0f, fsm_pat_size + 1);
765 if (pos == NPOS
766 || comp.find_first_of(0x0f, pos + 1) == NPOS) {
767 fsm_pat_size = comp.size();
768 }
769 }}
770 patterns.push_back(CPatternRec(comp, enzyme
771 - enzymes.begin(),
772 spec - specs.begin(),
773 Reverse(strand),
774 fsm_pat_size));
775 // add pattern to fsm
776 // (add only fsm_pat_size of it)
777 CFindRSites::x_AddPattern(comp.substr(0, fsm_pat_size), fsm,
778 patterns.size() - 1);
779 }
780 }
781 }
782
783 // The fsm is set up.
784 // Now do the search.
785
786 fsm.Prime();
787 vector<int> ambig_nucs; // for dealing with ambiguities later
788
789 int state = fsm.GetInitialState();
790 for (unsigned int i = 0; i < seq.size(); i++) {
791 if (CFindRSites::x_IsAmbig(seq[i])) {
792 ambig_nucs.push_back(i);
793 }
794 state = fsm.GetNextState(state, seq[i]);
795 if (fsm.IsMatchFound(state)) {
796 const vector<int>& matches = fsm.GetMatches(state);
797 ITERATE (vector<int>, match, matches) {
798 const CPatternRec& pattern = patterns[*match];
799 TSeqPos begin_pos = i + 1 - pattern.GetFsmPatSize();
800 TSeqPos end_pos = begin_pos
801 + pattern.GetPattern().size() - 1;
802
803 // check for a full match to sequence
804 if (pattern.GetFsmPatSize()
805 != pattern.GetPattern().size()) {
806 // Pattern put into fsm was less than the full pattern.
807 // Must check that the sequence really matches.
808 if (end_pos >= seq.size()) {
809 // full pattern goes off the end of sequence;
810 // not a match
811 continue;
812 }
813 // could really test match for just
814 // right end of pattern here
815 if (CSeqMatch::MatchNcbi8na(seq, pattern.GetPattern(),
816 begin_pos)
817 == CSeqMatch::eYes) {
818 // There must be a site here. However, because
819 // we'll later check all stretches containing
820 // ambiguities, we must ignore them here to keep
821 // from recording them twice.
822 bool ambig = false;
823 for (unsigned int n = begin_pos; n <= end_pos;
824 n++) {
825 if (CFindRSites::x_IsAmbig(seq[n])) {
826 ambig = true;
827 break;
828 }
829 }
830 if (ambig) {
831 continue; // ignore matches with ambig. seq.
832 }
833 } else {
834 continue; // not a real match
835 }
836 }
837 const CREnzyme& enzyme =
838 enzymes[pattern.GetEnzymeIndex()];
839 const CRSpec& spec =
840 enzyme.GetSpecs()[pattern.GetSpecIndex()];
841
842 CRSite site(begin_pos, end_pos);
843 site.SetStrand(pattern.GetStrand());
844 const vector<int>& plus_cuts = spec.GetPlusCuts();
845 ITERATE (vector<int>, cut, plus_cuts) {
846 if (IsReverse(pattern.GetStrand())) {
847 site.SetPlusCuts()
848 .push_back(begin_pos
849 + pattern.GetPattern().size()
850 - *cut);
851 } else {
852 site.SetPlusCuts().push_back(begin_pos + *cut);
853 }
854 }
855 const vector<int>& minus_cuts = spec.GetMinusCuts();
856 ITERATE (vector<int>, cut, minus_cuts) {
857 if (IsReverse(pattern.GetStrand())) {
858 site.SetMinusCuts()
859 .push_back(begin_pos
860 + pattern.GetPattern().size()
861 - *cut);
862 } else {
863 site.SetMinusCuts().push_back(begin_pos + *cut);
864 }
865 }
866
867 results[pattern.GetEnzymeIndex()]->SetDefiniteSites()
868 .push_back(site);
869 }
870 }
871 } // end iteration over the sequence
872
873 // We've found all the matches involving unambiguous characters
874 // in sequence. Now go back and examine any ambiguous places.
875 if (!ambig_nucs.empty()) {
876 ITERATE (vector<CPatternRec>, pattern, patterns) {
877 const string& pat = pattern->GetPattern();
878
879 // for reordering later
880 int ds_pos = results[pattern->GetEnzymeIndex()]
881 ->GetDefiniteSites().size();
882 int ps_pos = results[pattern->GetEnzymeIndex()]
883 ->GetPossibleSites().size();
884
885 // the next possible (starting) position to check
886 int next_pos = 0;
887
888 // iterate over ambiguous positions
889 ITERATE (vector<int>, pos, ambig_nucs) {
890 int begin_check = *pos - pat.size() + 1;
891 // don't try to check a negative position
892 begin_check = max(begin_check, 0);
893 // to avoid recording a site multiple times
894 // when there are two nearby ambiguities
895 begin_check = max(begin_check, next_pos);
896 int end_check = min(*pos, (int) (seq.size() - pat.size()));
897 int i;
898 for (i = begin_check; i <= end_check; i++) {
899 CSeqMatch::EMatch match
900 = CSeqMatch::MatchNcbi8na(seq, pat, i);
901 if (match == CSeqMatch::eNo) {
902 continue; // no match of any sort
903 }
904 // otherwise, a definite or possible site
905
906 CRSite site(i, i + pat.size() -1);
907 site.SetStrand(pattern->GetStrand());
908
909 // figure out cleavage locations
910 const vector<int>& plus_cuts
911 = enzymes[pattern->GetEnzymeIndex()]
912 .GetSpecs()[pattern->GetSpecIndex()].GetPlusCuts();
913 ITERATE (vector<int>, cut, plus_cuts) {
914 if (IsReverse(pattern->GetStrand())) {
915 site.SetPlusCuts()
916 .push_back(i + pattern->GetPattern().size()
917 - *cut);
918 } else {
919 site.SetPlusCuts().push_back(i + *cut);
920 }
921 }
922
923 const vector<int>& minus_cuts
924 = enzymes[pattern->GetEnzymeIndex()]
925 .GetSpecs()[pattern->GetSpecIndex()]
926 .GetMinusCuts();
927 ITERATE (vector<int>, cut, minus_cuts) {
928 if (IsReverse(pattern->GetStrand())) {
929 site.SetMinusCuts()
930 .push_back(i + pattern->GetPattern().size()
931 - *cut);
932 } else {
933 site.SetMinusCuts().push_back(i + *cut);
934 }
935 }
936
937
938 if (match == CSeqMatch::eYes) {
939 results[pattern->GetEnzymeIndex()]
940 ->SetDefiniteSites().push_back(site);
941 } else if (flags & CFindRSites::fFindPossibleSites) {
942 results[pattern->GetEnzymeIndex()]
943 ->SetPossibleSites().push_back(site);
944 }
945 }
946 next_pos = i;
947 } // end ITERATE over ambiguous positions
948
949 // definite_sites and possible_sites may be out of order
950 vector<CRSite>& def_sites = results[pattern->GetEnzymeIndex()]
951 ->SetDefiniteSites();
952 inplace_merge(def_sites.begin(),
953 def_sites.begin() + ds_pos,
954 def_sites.end(),
955 SCompareLocation());
956
957 vector<CRSite>& pos_sites = results[pattern->GetEnzymeIndex()]
958 ->SetPossibleSites();
959 inplace_merge(pos_sites.begin(),
960 pos_sites.begin() + ps_pos,
961 pos_sites.end(),
962 SCompareLocation());
963
964 } // end ITERATE over patterns
965 } // end if (!ambig_nucs.empty())
966 }
967
968
969 // CFindRSites::Find for various sequence containers
970
Find(const string & seq,const TEnzymes & enzymes,vector<CRef<CREnzResult>> & results,TFlags flags)971 void CFindRSites::Find(const string& seq,
972 const TEnzymes& enzymes,
973 vector<CRef<CREnzResult> >& results,
974 TFlags flags)
975 {
976 x_FindRSite(seq, enzymes, results, flags);
977 }
978
979
Find(const vector<char> & seq,const TEnzymes & enzymes,vector<CRef<CREnzResult>> & results,TFlags flags)980 void CFindRSites::Find(const vector<char>& seq,
981 const TEnzymes& enzymes,
982 vector<CRef<CREnzResult> >& results,
983 TFlags flags)
984 {
985 x_FindRSite(seq, enzymes, results, flags);
986 }
987
988
Find(const CSeqVector & seq,const TEnzymes & enzymes,vector<CRef<CREnzResult>> & results,TFlags flags)989 void CFindRSites::Find(const CSeqVector& seq,
990 const TEnzymes& enzymes,
991 vector<CRef<CREnzResult> >& results,
992 TFlags flags)
993 {
994 string seq_ncbi8na;
995 CSeqVector vec(seq);
996 vec.SetNcbiCoding();
997 vec.GetSeqData(0, vec.size(), seq_ncbi8na);
998 x_FindRSite(seq_ncbi8na, enzymes, results, flags);
999 }
1000
1001
1002
1003 END_NCBI_SCOPE
1004