1 /*  $Id: restriction.cpp 623045 2021-01-07 14:09:35Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Josh Cherry
27  *
28  * File Description:  Classes for representing and finding restriction sites
29  *
30  */
31 
32 
33 #include <ncbi_pch.hpp>
34 #include <algorithm>
35 #include <corelib/ncbistd.hpp>
36 
37 #include <objects/general/Dbtag.hpp>
38 #include <objects/general/Object_id.hpp>
39 #include <objects/seqfeat/Rsite_ref.hpp>
40 #include <objects/seqloc/Seq_point.hpp>
41 #include <objects/seq/Annot_descr.hpp>
42 #include <objects/seq/Annotdesc.hpp>
43 
44 #include <objmgr/seq_vector.hpp>
45 #include <objmgr/util/sequence.hpp>
46 
47 #include <algo/sequence/restriction.hpp>
48 
49 
50 BEGIN_NCBI_SCOPE
51 USING_SCOPE(objects);
52 
53 NCBI_PARAM_DECL(string, RESTRICTION_SITES, REBASE);
54 NCBI_PARAM_DEF (string, RESTRICTION_SITES, REBASE, "");
55 typedef NCBI_PARAM_TYPE(RESTRICTION_SITES, REBASE) TRebaseData;
56 
57 
operator <(const CRSpec & rhs) const58 bool CRSpec::operator<(const CRSpec& rhs) const
59 {
60     if (GetSeq() != rhs.GetSeq()) {
61         return GetSeq() < rhs.GetSeq();
62     }
63     // otherwise sequences identical
64     if (GetPlusCuts() != rhs.GetPlusCuts()) {
65         return GetPlusCuts() < rhs.GetPlusCuts();
66     }
67     if (GetMinusCuts() != rhs.GetMinusCuts()) {
68         return GetMinusCuts() < rhs.GetMinusCuts();
69     }
70     // otherwise arguments are equal, and < is false
71     return false;
72 }
73 
74 
Reset(void)75 void CREnzyme::Reset(void)
76 {
77     m_Name.erase();
78     m_Specs.clear();
79 }
80 
81 
82 // helper functor for sorting enzymes by specificity
83 struct SCompareSpecs
84 {
operator ()SCompareSpecs85     bool operator()(const CREnzyme& lhs, const CREnzyme& rhs)
86     {
87         return lhs.GetSpecs() < rhs.GetSpecs();
88     }
89 };
90 
91 
CombineIsoschizomers(TEnzymes & enzymes)92 void CREnzyme::CombineIsoschizomers(TEnzymes& enzymes)
93 {
94     stable_sort(enzymes.begin(), enzymes.end(), SCompareSpecs());
95     TEnzymes result;
96     ITERATE (TEnzymes, enzyme, enzymes) {
97         if (enzyme != enzymes.begin() &&
98             enzyme->GetSpecs() == result.back().GetSpecs()) {
99             result.back().SetName() += "/";
100             result.back().SetName() += enzyme->GetName();
101         } else {
102             result.push_back(*enzyme);
103             result.back().SetPrototype();
104         }
105     }
106     swap(enzymes, result);
107 }
108 
109 
Reset(void)110 void CRSpec::Reset(void)
111 {
112     m_Seq.erase();
113     m_PlusCuts.clear();
114     m_MinusCuts.clear();
115 }
116 
117 
operator <<(ostream & os,const CRSite & site)118 ostream& operator<<(ostream& os, const CRSite& site)
119 {
120     os << "Recog. site: " << site.GetStart() << '-'
121        << site.GetEnd() << endl;
122     os << "Plus strand cuts: ";
123     string s;
124     ITERATE (vector<int>, cut, site.GetPlusCuts()) {
125         if (!s.empty()) {
126             s += " ,";
127         }
128         s += NStr::IntToString(*cut);
129     }
130     os << s << endl;
131 
132     os << "Minus strand cuts: ";
133     s.erase();
134     ITERATE (vector<int>, cut, site.GetMinusCuts()) {
135         if (!s.empty()) {
136             s += " ,";
137         }
138         s += NStr::IntToString(*cut);
139     }
140     os << s << endl;
141 
142     return os;
143 }
144 
145 
MakeRSpec(const string & site)146 CRSpec CRebase::MakeRSpec(const string& site)
147 {
148     // site contains a string such as
149     // GACGTC, GACGT^C, GCAGC(8/12), or (8/13)GAGNNNNNCTC(13/8)
150 
151     CRSpec spec;
152 
153     int plus_cut, minus_cut;
154 
155     string s = site;
156     if (s[0] == '(') {
157         string::size_type idx = s.find_first_of(")");
158         if (idx == std::string::npos) {
159             throw runtime_error(string("Error parsing site ")
160                                 + site);
161         }
162         x_ParseCutPair(s.substr(0, idx + 1), plus_cut, minus_cut);
163         s.erase(0, idx + 1);
164         spec.SetPlusCuts().push_back(-plus_cut);
165         spec.SetMinusCuts().push_back(-minus_cut);
166     }
167     if (s[s.length() - 1] == ')') {
168         string::size_type idx = s.find_last_of("(");
169         if (idx == std::string::npos) {
170             throw runtime_error(string("Error parsing site ")
171                                 + site);
172         }
173         x_ParseCutPair(s.substr(idx), plus_cut, minus_cut);
174         s.erase(idx);
175         spec.SetPlusCuts().push_back(plus_cut + s.length());
176         spec.SetMinusCuts().push_back(minus_cut + s.length());
177     }
178     for (unsigned int i = 0;  i < s.length();  i++) {
179         if (s[i] == '^') {
180 
181             // This should be simple.  If we have
182             // G^GATCC, the cut on the plus strand is
183             // at 1, and on the minus it's at the
184             // symmetric position, 5.
185             // The complication is that TspRI
186             // is given as CASTGNN^, not NNCASTGNN^.
187             // Someone someday might write NNCASTGNN^,
188             // and something like ^NNGATC might arise,
189             // so code defensively.
190 
191             s.erase(i, 1);
192             int plus_cut = i;
193 
194             // this is the slightly complicated bit
195             // trim any leading and trailing N's;
196             // in case leading N's removed, adjust plus_cut accordingly
197             string::size_type idx = s.find_first_not_of("N");
198             if (idx == string::npos) {
199                 // bizarre situation; all N's (possibly empty)
200                 s.erase();
201                 plus_cut = 0;
202             } else {
203                 s.erase(0, idx);
204                 plus_cut -= idx;
205             }
206             idx = s.find_last_not_of("N");
207             s.erase(idx + 1);
208 
209             // plus strand cut
210             spec.SetPlusCuts().push_back(plus_cut);
211             // symmetric cut on minus strand
212             spec.SetMinusCuts().push_back(s.length() - plus_cut);
213             break;  // there better be just one '^'
214         }
215     }
216     spec.SetSeq(s);
217     return spec;
218 }
219 
220 
MakeREnzyme(const string & name,const string & sites)221 CREnzyme CRebase::MakeREnzyme(const string& name, const string& sites)
222 {
223 
224     CREnzyme re;
225 
226     re.SetName(name);
227 
228     vector<string> site_vec;
229     NStr::Split(sites, ",", site_vec);
230     ITERATE (vector<string>, iter, site_vec) {
231         CRSpec spec = CRebase::MakeRSpec(*iter);
232         re.SetSpecs().push_back(spec);
233     }
234 
235     return re;
236 }
237 
238 
GetDefaultDataPath()239 string CRebase::GetDefaultDataPath()
240 {
241     return TRebaseData::GetDefault();
242 }
243 
244 
x_ParseCutPair(const string & s,int & plus_cut,int & minus_cut)245 void CRebase::x_ParseCutPair(const string& s, int& plus_cut, int& minus_cut)
246 {
247     // s should look like "(8/13)"; plus_cut gets set to 8 and minus_cut to 13
248     list<string> l;
249     NStr::Split(s.substr(1, s.length() - 2), "/", l, NStr::fSplit_Tokenize);
250     if (l.size() != 2) {
251         throw runtime_error(string("Couldn't parse cut locations ")
252                             + s);
253     }
254     plus_cut = NStr::StringToInt(l.front());
255     minus_cut = NStr::StringToInt(l.back());
256 }
257 
258 
ReadNARFormat(istream & input,TEnzymes & enzymes,enum EEnzymesToLoad which)259 void CRebase::ReadNARFormat(istream& input,
260                             TEnzymes& enzymes,
261                             enum EEnzymesToLoad which)
262 {
263     string line;
264     CREnzyme enzyme;
265     string name;
266     TEnzymes::size_type prototype_idx(0);
267 
268     while (getline(input, line)) {
269         vector<string> fields;
270         NStr::Split(line, "\t", fields);
271         // the lines we're interested in have more than one field
272         if (fields.size() < 2) {
273             continue;
274         }
275         // name of enzyme is in field one or two (field zero is empty)
276         name = fields[1];
277         bool is_prototype(true);
278         if (name == " ") {
279             if (which == ePrototype) {
280                 continue;  // skip this non-protype
281             }
282             is_prototype = false;
283             name = fields[2];
284         }
285         if (which == eCommercial && fields[5].empty()) {
286             continue;  // skip this commercially unavailable enzyme
287         }
288         string sites = fields[3];  // this contains a comma-separted list of sites,
289                             // usually just one (but TaqII has two)
290         enzyme = MakeREnzyme(name, sites);
291         enzymes.push_back(enzyme);
292 
293         // Link isoschizomers with their prototypes.
294         if (is_prototype) {
295             prototype_idx = enzymes.size();
296         } else if (prototype_idx) {
297             CREnzyme& prototype = enzymes[prototype_idx - 1];
298             enzyme.SetPrototype(prototype.GetName());
299             prototype.SetIsoschizomers().push_back(name);
300         }
301     }
302 }
303 
304 
operator <<(ostream & os,const CREnzResult & er)305 ostream& operator<<(ostream& os, const CREnzResult& er)
306 {
307     os << "Enzyme: " << er.GetEnzymeName() << endl;
308     os << er.GetDefiniteSites().size() << " definite sites:" << endl;
309     ITERATE (vector<CRSite>, site, er.GetDefiniteSites()) {
310         os << *site;
311     }
312     os << er.GetPossibleSites().size() << " possible sites:" << endl;
313     ITERATE (vector<CRSite>, site, er.GetPossibleSites()) {
314         os << *site;
315     }
316     return os;
317 }
318 
319 
320 struct SCompareLocation
321 {
operator ()SCompareLocation322     bool operator() (const CRSite& lhs, const CRSite& rhs) const
323     {
324         return lhs.GetStart() < rhs.GetStart();
325     }
326 };
327 
328 
329 // Class for internal use by restriction site finding.
330 // Holds a pattern in ncbi8na, integers that specify
331 // which enzyme (of a sequential collection)
332 // and which specifity of that
333 // enzyme it represents, a field indicating
334 // whether it represents the complement of the specificity,
335 // and the length of the initial subpattern that was
336 // put into the fsm.
337 class CPatternRec
338 {
339 public:
CPatternRec(string pattern,int enzyme_index,int spec_index,ENa_strand strand,unsigned int fsm_pat_size)340     CPatternRec(string pattern, int enzyme_index, int spec_index,
341                 ENa_strand strand,
342                 unsigned int fsm_pat_size) : m_Pattern(pattern),
343                                              m_EnzymeIndex(enzyme_index),
344                                              m_SpecIndex(spec_index),
345                                              m_Strand(strand),
346                                              m_FsmPatSize(fsm_pat_size) {}
347     // member access
GetPattern(void) const348     const string& GetPattern(void) const {return m_Pattern;}
GetEnzymeIndex(void) const349     int GetEnzymeIndex(void) const {return m_EnzymeIndex;}
GetSpecIndex(void) const350     int GetSpecIndex(void) const {return m_SpecIndex;}
GetStrand(void) const351     ENa_strand GetStrand(void) const {return m_Strand;}
GetFsmPatSize(void) const352     unsigned int GetFsmPatSize(void) const {return m_FsmPatSize;}
353 private:
354     // pattern in ncbi8na
355     string m_Pattern;
356     // which enzyme and specificity we represent
357     int m_EnzymeIndex;
358     int m_SpecIndex;
359     // whether we represent the complement of the specificity
360     ENa_strand m_Strand;
361     unsigned int m_FsmPatSize;
362 };
363 
364 
365 // helper functor for sorting CRef<CREnzResult> by the enzyme name
366 struct SEnzymeNameCompare
367 {
operator ()SEnzymeNameCompare368     bool operator()
369         (const CRef<CREnzResult>& lhs, const CRef<CREnzResult>& rhs) const
370     {
371         return lhs->GetEnzymeName() < rhs->GetEnzymeName();
372     }
373 };
374 
375 
376 // helper functor for sorting CREnzyme by the enzyme name
377 struct SNameCompare
378 {
operator ()SNameCompare379     bool operator()
380         (const CREnzyme& lhs, const CREnzyme& rhs) const
381     {
382         return lhs.GetName() < rhs.GetName();
383     }
384 };
385 
386 
387 // helper functor for sorting by the number of definite sites
388 struct SLessDefSites
389 {
operator ()SLessDefSites390     bool operator()
391         (const CRef<CREnzResult>& lhs, const CRef<CREnzResult>& rhs) const
392     {
393         return lhs->GetDefiniteSites().size() < rhs->GetDefiniteSites().size();
394     }
395 };
396 
397 
398 // helper functor for sorting CRef<CSeq_loc>s by location
399 struct SLessSeq_loc
400 {
operator ()SLessSeq_loc401     bool operator()
402         (const CRef<CSeq_loc>& lhs, const CRef<CSeq_loc>& rhs) const
403     {
404         return (lhs->Compare(*rhs, CSeq_loc::fCompare_Default) < 0);
405     }
406 };
407 
408 
409 
410 
x_ExpandRecursion(string & s,unsigned int pos,CTextFsm<int> & fsm,int match_value)411 void CFindRSites::x_ExpandRecursion(string& s, unsigned int pos,
412                                     CTextFsm<int>& fsm, int match_value)
413 {
414     if (pos == s.size()) {
415         // this is the place
416         fsm.AddWord(s, match_value);
417         return;
418     }
419 
420     char orig_ch = s[pos];
421     for (char x = 1;  x <= 8;  x <<= 1) {
422         if (orig_ch & x) {
423             s[pos] = x;
424             x_ExpandRecursion(s, pos + 1, fsm, match_value);
425         }
426     }
427     // restore original character
428     s[pos] = orig_ch;
429 }
430 
431 
CFindRSites(const string & refile,CRebase::EEnzymesToLoad which_enzymes,TFlags flags)432 CFindRSites::CFindRSites(const string& refile,
433                          CRebase::EEnzymesToLoad which_enzymes,
434                          TFlags flags)
435     : m_Flags(flags)
436 {
437     x_LoadREnzymeData(refile.empty()
438                           ? CRebase::GetDefaultDataPath()
439                           : refile,
440                       which_enzymes);
441 }
442 
443 
GetEnzymes()444 const CFindRSites::TEnzymes& CFindRSites::GetEnzymes()
445 {
446     return m_Enzymes;
447 }
448 
449 
SetEnzymes()450 CFindRSites::TEnzymes& CFindRSites::SetEnzymes()
451 {
452     return m_Enzymes;
453 }
454 
455 
x_LoadREnzymeData(const string & refile,CRebase::EEnzymesToLoad which_enzymes)456 void CFindRSites::x_LoadREnzymeData(const string& refile,
457                                     CRebase::EEnzymesToLoad which_enzymes)
458 {
459     if (! refile.empty()) {
460         ifstream istr(refile.c_str());
461         CRebase::ReadNARFormat(istr, m_Enzymes, which_enzymes);
462 
463         if (m_Flags & fCombineIsoschizomers) {
464             // first sort alphabetically by enzyme name
465             sort(m_Enzymes.begin(), m_Enzymes.end(), SNameCompare());
466             // now combine isoschizomers
467             CREnzyme::CombineIsoschizomers(m_Enzymes);
468         }
469     }
470 }
471 
472 
473 // remap a child location to a parent
s_RemapChildToParent(const CSeq_loc & parent,const CSeq_loc & child,CScope * scope)474 static CRef<CSeq_loc> s_RemapChildToParent(const CSeq_loc& parent,
475                                            const CSeq_loc& child,
476                                            CScope* scope)
477 {
478     CSeq_loc dummy_parent;
479     dummy_parent.SetWhole(const_cast<CSeq_id&>(sequence::GetId(parent, 0)));
480     SRelLoc converter(dummy_parent, child, scope, SRelLoc::fNoMerge);
481     converter.m_ParentLoc = &parent;
482     return converter.Resolve(scope);
483 }
484 
485 static
s_AddSitesToAnnot(const vector<CRSite> & sites,const CREnzResult & result,CSeq_annot & annot,CScope & scope,const CSeq_loc & parent_loc,bool definite=true)486 void s_AddSitesToAnnot(const vector<CRSite>& sites,
487                        const CREnzResult&    result,
488                        CSeq_annot&           annot,
489                        CScope&               scope,
490                        const CSeq_loc&       parent_loc,
491                        bool                  definite = true)
492 {
493     const CSeq_id& id = sequence::GetId(parent_loc, 0);
494 
495     ITERATE (vector<CRSite>, site, sites) {
496         // create feature
497         CRef<CSeq_feat> feat(new CSeq_feat());
498 
499         // start to set up Rsite
500         feat->SetData().SetRsite().SetDb().SetDb("REBASE");
501         feat->SetData().SetRsite().SetDb()
502             .SetTag().SetStr("REBASE");
503 
504         string str(result.GetEnzymeName());
505         if ( !definite ) {
506             str = "Possible " + str;
507         }
508         feat->SetData().SetRsite().SetStr(str);
509 
510         //
511         // build our location
512         //
513 
514         vector< CRef<CSeq_loc> > locs;
515 
516         // a loc for the recognition site
517         CRef<CSeq_loc> recog_site(new CSeq_loc);
518         recog_site->SetInt().SetFrom(site->GetStart());
519         recog_site->SetInt().SetTo  (site->GetEnd());
520         recog_site->SetInt().SetStrand(site->GetStrand());
521         recog_site->SetId(id);
522         locs.push_back(recog_site);
523 
524         ENa_strand cut_strand =
525             IsReverse(site->GetStrand()) ? eNa_strand_minus
526                                          : eNa_strand_plus;
527 
528         // locs for the cleavage sites
529         int negative_cut_locs = 0;  // count these exceptions
530         ITERATE (vector<int>, cut, site->GetPlusCuts()) {
531             if (*cut >= 0 ) {
532                 CRef<CSeq_loc> cut_site(new CSeq_loc);
533                 cut_site->SetPnt().SetPoint(*cut);
534                 // indicate that the cut is to the "left"
535                 cut_site->SetPnt()
536                     .SetFuzz().SetLim(CInt_fuzz::eLim_tl);
537                 cut_site->SetPnt().SetStrand(cut_strand);
538                 cut_site->SetId(id);
539                 locs.push_back(cut_site);
540             } else {
541                 negative_cut_locs++;
542             }
543         }
544 
545         ITERATE (vector<int>, cut, site->GetMinusCuts()) {
546             if (*cut >= 0 ) {
547                 CRef<CSeq_loc> cut_site(new CSeq_loc);
548                 cut_site->SetPnt().SetPoint(*cut);
549                 // indicate that the cut is to the "left"
550                 cut_site->SetPnt()
551                     .SetFuzz().SetLim(CInt_fuzz::eLim_tl);
552                 cut_site->SetPnt().SetStrand(Reverse(cut_strand));
553                 cut_site->SetId(id);
554                 locs.push_back(cut_site);
555             } else {
556                 negative_cut_locs++;
557             }
558         }
559 
560         // comment for those few cases where there are
561         // cuts before the sequence begins
562         if (negative_cut_locs > 0) {
563             string a_comm = NStr::IntToString(negative_cut_locs)
564                 + " cleavage sites are located before the"
565                 " beginning of the sequence and are not reported";
566             feat->SetComment(a_comm);
567         }
568 
569         sort(locs.begin(), locs.end(), SLessSeq_loc());
570         copy(locs.begin(), locs.end(),
571              back_inserter(feat->SetLocation().SetMix().Set()));
572 
573         CRef<CSeq_loc> mapped =
574             s_RemapChildToParent(parent_loc, feat->GetLocation(), &scope);
575         feat->SetLocation(*mapped);
576 
577         // save in annot
578         annot.SetData().SetFtable().push_back(feat);
579     }
580 }
581 
582 
583 CFindRSites::TAnnot
GetAnnot(CScope & scope,const CSeq_loc & loc) const584 CFindRSites::GetAnnot(CScope& scope, const CSeq_loc& loc) const
585 {
586     CTime now(CTime::eCurrent);
587     // get sequence in binary (8na) form
588     CSeqVector vec(loc, scope, CBioseq_Handle::eCoding_Ncbi);
589 
590     // a place to store results (one per enzyme)
591     typedef vector<CRef<CREnzResult> > TResults;
592     TResults results;
593 
594     // do the big search
595     Find(vec, m_Enzymes, results, m_Flags);
596 
597     // deal with stored enzyme results
598 
599     sort(results.begin(), results.end(), SEnzymeNameCompare());
600 
601     if (m_Flags & fSortByNumberOfSites) {
602         // do a stablesort so alphabetical order preserved
603         // within sets with the same number of sites
604         stable_sort(results.begin(), results.end(), SLessDefSites());
605     }
606 
607     TAnnot annot;
608     int total_definite_sites = 0, total_possible_sites = 0;
609     int total_non_cutters = 0;
610 
611     ITERATE (TResults, result, results) {
612         const vector<CRSite>& definite_sites =
613             (*result)->GetDefiniteSites();
614         const vector<CRSite>& possible_sites =
615             (*result)->GetPossibleSites();
616 
617         int count_definite_sites = definite_sites.size();
618         int count_possible_sites = possible_sites.size();
619 
620         if (count_definite_sites  ||  count_possible_sites) {
621             total_definite_sites += count_definite_sites;
622             total_possible_sites += count_possible_sites;
623 
624             if (m_Flags & fSplitAnnotsByEnzyme  ||  annot.empty()) {
625                 CRef<CSeq_annot> new_annot(new CSeq_annot);
626                 // add description to annot
627                 const string title("Restriction sites");
628                 new_annot->SetNameDesc(title);
629                 new_annot->SetTitleDesc(title);
630                 new_annot->SetCreateDate(now);
631                 CRef<CAnnotdesc> region(new CAnnotdesc);
632                 region->SetRegion().Assign(loc);
633                 new_annot->SetDesc().Set().push_back(region);
634                 annot.push_back(new_annot);
635             }
636             CSeq_annot& curr_annot(*annot.back());
637 
638             //
639             // add features to annot
640             //
641             s_AddSitesToAnnot(definite_sites,
642                               **result, curr_annot, scope, loc);
643             s_AddSitesToAnnot(possible_sites,
644                               **result, curr_annot, scope, loc, false);
645         } else {
646             ++total_non_cutters;
647         }
648     }
649 
650     /**
651     // in order to build dialog efficiently,
652     // pre-allocate rows
653     int total_rows = total_definite_sites + total_possible_sites
654         + total_non_cutters;
655         **/
656 
657     _TRACE("Found " << total_definite_sites << " definite and "
658            << total_possible_sites << " possible sites");
659 
660     return annot;
661 }
662 
663 
664 CFindRSites::TAnnot
GetAnnot(CBioseq_Handle bsh) const665 CFindRSites::GetAnnot(CBioseq_Handle bsh) const
666 {
667     CSeq_id_Handle idh = sequence::GetId(bsh, sequence::eGetId_Canonical);
668     CRef<CSeq_loc> loc(new CSeq_loc);
669     loc->SetWhole().Assign(*idh.GetSeqId());
670     return GetAnnot(bsh.GetScope(), *loc);
671 }
672 
673 
x_AddPattern(const string & pat,CTextFsm<int> & fsm,int match_value)674 void CFindRSites::x_AddPattern(const string& pat, CTextFsm<int>& fsm,
675                                int match_value)
676 {
677     string s = pat;
678     x_ExpandRecursion(s, 0, fsm, match_value);
679 }
680 
681 
x_IsAmbig(char nuc)682 bool CFindRSites::x_IsAmbig (char nuc)
683 {
684     static const bool ambig_table[16] = {
685         0, 0, 0, 1, 0, 1, 1, 1,
686         0, 1, 1, 1, 1, 1, 1, 1
687     };
688     return ambig_table[(size_t) nuc];
689 }
690 
691 
692 /// Find all definite and possible sites in a sequence
693 /// for a vector of enzymes, using a finite state machine.
694 /// Templatized so that seq can be various containers
695 /// (e.g., string, vector<char>, CSeqVector),
696 /// but it must yield ncbi8na.
697 template<class Seq>
x_FindRSite(const Seq & seq,const CFindRSites::TEnzymes & enzymes,vector<CRef<CREnzResult>> & results,CFindRSites::TFlags flags)698 void x_FindRSite(const Seq& seq, const CFindRSites::TEnzymes& enzymes,
699                  vector<CRef<CREnzResult> >& results,
700                  CFindRSites::TFlags flags)
701 {
702 
703     results.clear();
704     results.reserve(enzymes.size());
705 
706     // vector of patterns for internal use
707     vector<CPatternRec> patterns;
708     patterns.reserve(enzymes.size());  // an underestimate
709 
710     // the finite state machine for the search
711     CTextFsm<int> fsm;
712 
713     // iterate over enzymes
714     ITERATE (CFindRSites::TEnzymes, enzyme, enzymes) {
715         // Unless isoschizomers are requested, skip them.
716         if (! (flags & CFindRSites::fFindIsoschizomers  ||
717                enzyme->IsPrototype())) {
718             continue;
719         }
720 
721         results.push_back(CRef<CREnzResult>
722                           (new CREnzResult(enzyme->GetName())));
723 
724         string pat;
725         const vector<CRSpec>& specs = enzyme->GetSpecs();
726 
727         // iterate over specificities for this enzyme
728         ITERATE (vector<CRSpec>, spec, specs) {
729             // one or two patterns for this specificity
730             // (one for pallindrome, two otherwise)
731 
732             // to avoid combinatorial explosion,
733             // if there are more than two Ns only use
734             // part of pattern before first N
735             CSeqMatch::IupacToNcbi8na(spec->GetSeq(), pat);
736             // Check if the pattern is pallindromic.
737             string comp = pat;
738             CSeqMatch::CompNcbi8na(comp);
739             ENa_strand strand =
740                 (comp == pat) ? eNa_strand_both : eNa_strand_plus;
741 
742             SIZE_TYPE fsm_pat_size = pat.find_first_of(0x0f);
743             {{
744                 SIZE_TYPE pos = pat.find_first_of(0x0f, fsm_pat_size + 1);
745                 if (pos == NPOS
746                     ||  pat.find_first_of(0x0f, pos + 1) == NPOS) {
747                     fsm_pat_size = pat.size();
748                 }
749             }}
750 
751             patterns.push_back(CPatternRec(pat, enzyme - enzymes.begin(),
752                                            spec - specs.begin(),
753                                            strand,
754                                            fsm_pat_size));
755             // add pattern to fsm
756             // (add only fsm_pat_size of it)
757             CFindRSites::x_AddPattern(pat.substr(0, fsm_pat_size), fsm,
758                                       patterns.size() - 1);
759 
760             // If not palidromic do a search for its complement too
761             if (strand != eNa_strand_both) {
762                 {{
763                     fsm_pat_size = comp.find_first_of(0x0f);
764                     SIZE_TYPE pos = comp.find_first_of(0x0f, fsm_pat_size + 1);
765                     if (pos == NPOS
766                         ||  comp.find_first_of(0x0f, pos + 1) == NPOS) {
767                         fsm_pat_size = comp.size();
768                     }
769                 }}
770                 patterns.push_back(CPatternRec(comp, enzyme
771                                                - enzymes.begin(),
772                                                spec - specs.begin(),
773                                                Reverse(strand),
774                                                fsm_pat_size));
775                 // add pattern to fsm
776                 // (add only fsm_pat_size of it)
777                 CFindRSites::x_AddPattern(comp.substr(0, fsm_pat_size), fsm,
778                                           patterns.size() - 1);
779             }
780         }
781     }
782 
783     // The fsm is set up.
784     // Now do the search.
785 
786     fsm.Prime();
787     vector<int> ambig_nucs;  // for dealing with ambiguities later
788 
789     int state = fsm.GetInitialState();
790     for (unsigned int i = 0;  i < seq.size();  i++) {
791         if (CFindRSites::x_IsAmbig(seq[i])) {
792             ambig_nucs.push_back(i);
793         }
794         state = fsm.GetNextState(state, seq[i]);
795         if (fsm.IsMatchFound(state)) {
796             const vector<int>& matches = fsm.GetMatches(state);
797             ITERATE (vector<int>, match, matches) {
798                 const CPatternRec& pattern = patterns[*match];
799                 TSeqPos begin_pos = i + 1 - pattern.GetFsmPatSize();
800                 TSeqPos end_pos = begin_pos
801                     + pattern.GetPattern().size() - 1;
802 
803                 // check for a full match to sequence
804                 if (pattern.GetFsmPatSize()
805                     != pattern.GetPattern().size()) {
806                     // Pattern put into fsm was less than the full pattern.
807                     // Must check that the sequence really matches.
808                     if (end_pos >= seq.size()) {
809                         // full pattern goes off the end of sequence;
810                         // not a match
811                         continue;
812                     }
813                     // could really test match for just
814                     // right end of pattern here
815                     if (CSeqMatch::MatchNcbi8na(seq, pattern.GetPattern(),
816                                                 begin_pos)
817                         == CSeqMatch::eYes) {
818                         // There must be a site here.  However, because
819                         // we'll later check all stretches containing
820                         // ambiguities, we must ignore them here to keep
821                         // from recording them twice.
822                         bool ambig = false;
823                         for (unsigned int n = begin_pos;  n <= end_pos;
824                              n++) {
825                             if (CFindRSites::x_IsAmbig(seq[n])) {
826                                 ambig = true;
827                                 break;
828                             }
829                         }
830                         if (ambig) {
831                             continue;  // ignore matches with ambig. seq.
832                         }
833                     } else {
834                         continue;  // not a real match
835                     }
836                 }
837                 const CREnzyme& enzyme =
838                     enzymes[pattern.GetEnzymeIndex()];
839                 const CRSpec& spec =
840                     enzyme.GetSpecs()[pattern.GetSpecIndex()];
841 
842                 CRSite site(begin_pos, end_pos);
843                 site.SetStrand(pattern.GetStrand());
844                 const vector<int>& plus_cuts = spec.GetPlusCuts();
845                 ITERATE (vector<int>, cut, plus_cuts) {
846                     if (IsReverse(pattern.GetStrand())) {
847                         site.SetPlusCuts()
848                             .push_back(begin_pos
849                                        + pattern.GetPattern().size()
850                                        - *cut);
851                     } else {
852                         site.SetPlusCuts().push_back(begin_pos + *cut);
853                     }
854                 }
855                 const vector<int>& minus_cuts = spec.GetMinusCuts();
856                 ITERATE (vector<int>, cut, minus_cuts) {
857                     if (IsReverse(pattern.GetStrand())) {
858                         site.SetMinusCuts()
859                             .push_back(begin_pos
860                                        + pattern.GetPattern().size()
861                                        - *cut);
862                     } else {
863                         site.SetMinusCuts().push_back(begin_pos + *cut);
864                     }
865                 }
866 
867                 results[pattern.GetEnzymeIndex()]->SetDefiniteSites()
868                     .push_back(site);
869             }
870         }
871     }  // end iteration over the sequence
872 
873     // We've found all the matches involving unambiguous characters
874     // in sequence.  Now go back and examine any ambiguous places.
875     if (!ambig_nucs.empty()) {
876         ITERATE (vector<CPatternRec>, pattern, patterns) {
877             const string& pat = pattern->GetPattern();
878 
879             // for reordering later
880             int ds_pos = results[pattern->GetEnzymeIndex()]
881                 ->GetDefiniteSites().size();
882             int ps_pos = results[pattern->GetEnzymeIndex()]
883                 ->GetPossibleSites().size();
884 
885             // the next possible (starting) position to check
886             int next_pos = 0;
887 
888             // iterate over ambiguous positions
889             ITERATE (vector<int>, pos, ambig_nucs) {
890                 int begin_check = *pos - pat.size() + 1;
891                 // don't try to check a negative position
892                 begin_check = max(begin_check, 0);
893                 // to avoid recording a site multiple times
894                 // when there are two nearby ambiguities
895                 begin_check = max(begin_check, next_pos);
896                 int end_check = min(*pos, (int) (seq.size() - pat.size()));
897                 int i;
898                 for (i = begin_check;  i <= end_check;  i++) {
899                     CSeqMatch::EMatch match
900                         = CSeqMatch::MatchNcbi8na(seq, pat, i);
901                     if (match == CSeqMatch::eNo) {
902                         continue;  // no match of any sort
903                     }
904                     // otherwise, a definite or possible site
905 
906                     CRSite site(i, i + pat.size() -1);
907                     site.SetStrand(pattern->GetStrand());
908 
909                     // figure out cleavage locations
910                     const vector<int>& plus_cuts
911                         = enzymes[pattern->GetEnzymeIndex()]
912                         .GetSpecs()[pattern->GetSpecIndex()].GetPlusCuts();
913                     ITERATE (vector<int>, cut, plus_cuts) {
914                         if (IsReverse(pattern->GetStrand())) {
915                             site.SetPlusCuts()
916                                 .push_back(i + pattern->GetPattern().size()
917                                            - *cut);
918                         } else {
919                             site.SetPlusCuts().push_back(i + *cut);
920                         }
921                     }
922 
923                     const vector<int>& minus_cuts
924                         = enzymes[pattern->GetEnzymeIndex()]
925                         .GetSpecs()[pattern->GetSpecIndex()]
926                         .GetMinusCuts();
927                     ITERATE (vector<int>, cut, minus_cuts) {
928                         if (IsReverse(pattern->GetStrand())) {
929                             site.SetMinusCuts()
930                                 .push_back(i + pattern->GetPattern().size()
931                                            - *cut);
932                         } else {
933                             site.SetMinusCuts().push_back(i + *cut);
934                         }
935                     }
936 
937 
938                     if (match == CSeqMatch::eYes) {
939                         results[pattern->GetEnzymeIndex()]
940                             ->SetDefiniteSites().push_back(site);
941                     } else if (flags & CFindRSites::fFindPossibleSites) {
942                         results[pattern->GetEnzymeIndex()]
943                             ->SetPossibleSites().push_back(site);
944                     }
945                 }
946                 next_pos = i;
947             }  // end ITERATE over ambiguous positions
948 
949             // definite_sites and possible_sites may be out of order
950             vector<CRSite>& def_sites = results[pattern->GetEnzymeIndex()]
951                 ->SetDefiniteSites();
952             inplace_merge(def_sites.begin(),
953                           def_sites.begin() + ds_pos,
954                           def_sites.end(),
955                           SCompareLocation());
956 
957             vector<CRSite>& pos_sites = results[pattern->GetEnzymeIndex()]
958                 ->SetPossibleSites();
959             inplace_merge(pos_sites.begin(),
960                           pos_sites.begin() + ps_pos,
961                           pos_sites.end(),
962                           SCompareLocation());
963 
964         }  // end ITERATE over patterns
965     }  // end if (!ambig_nucs.empty())
966 }
967 
968 
969 // CFindRSites::Find for various sequence containers
970 
Find(const string & seq,const TEnzymes & enzymes,vector<CRef<CREnzResult>> & results,TFlags flags)971 void CFindRSites::Find(const string& seq,
972                        const TEnzymes& enzymes,
973                        vector<CRef<CREnzResult> >& results,
974                        TFlags flags)
975 {
976     x_FindRSite(seq, enzymes, results, flags);
977 }
978 
979 
Find(const vector<char> & seq,const TEnzymes & enzymes,vector<CRef<CREnzResult>> & results,TFlags flags)980 void CFindRSites::Find(const vector<char>& seq,
981                        const TEnzymes& enzymes,
982                        vector<CRef<CREnzResult> >& results,
983                        TFlags flags)
984 {
985     x_FindRSite(seq, enzymes, results, flags);
986 }
987 
988 
Find(const CSeqVector & seq,const TEnzymes & enzymes,vector<CRef<CREnzResult>> & results,TFlags flags)989 void CFindRSites::Find(const CSeqVector& seq,
990                        const TEnzymes& enzymes,
991                        vector<CRef<CREnzResult> >& results,
992                        TFlags flags)
993 {
994     string seq_ncbi8na;
995     CSeqVector vec(seq);
996     vec.SetNcbiCoding();
997     vec.GetSeqData(0, vec.size(), seq_ncbi8na);
998     x_FindRSite(seq_ncbi8na, enzymes, results, flags);
999 }
1000 
1001 
1002 
1003 END_NCBI_SCOPE
1004