1 /*  $Id: gaps_edit.cpp 632623 2021-06-03 17:38:11Z ivanov $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors:  Sergiy Gotvyanskyy, NCBI
27 *
28 * File Description:
29 *   Converts runs of 'N' or 'n' into a gap
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 
36 #include <corelib/ncbiutil.hpp>
37 
38 #include <objects/seq/Bioseq.hpp>
39 #include <objects/seqset/Bioseq_set.hpp>
40 #include <objects/seqset/Seq_entry.hpp>
41 #include <objects/seq/Delta_ext.hpp>
42 #include <objects/seq/Delta_seq.hpp>
43 #include <objects/seq/Seq_gap.hpp>
44 #include <objects/seq/Seq_literal.hpp>
45 #include <objects/general/Int_fuzz.hpp>
46 #include <objects/seq/Seq_inst.hpp>
47 #include <objects/seq/Seq_ext.hpp>
48 
49 #include <util/sequtil/sequtil_convert.hpp>
50 
51 #include <objtools/edit/gaps_edit.hpp>
52 
53 BEGIN_NCBI_SCOPE
54 BEGIN_SCOPE(objects)
55 
56 namespace
57 {
58 
Make_Iupacna(const CSeq_data & data,string & decoded,TSeqPos len)59 bool Make_Iupacna(const CSeq_data& data, string& decoded, TSeqPos len)
60 {
61     CSeqUtil::TCoding src_coding = CSeqUtil::e_not_set;
62     CSeqUtil::TCoding dst_coding = CSeqUtil::e_Iupacna;
63     CTempString      src;
64     switch (data.Which()) {
65 #define CODING_CASE(x) \
66     case CSeq_data::e_##x: \
67         src.assign(&data.Get##x().Get()[0], data.Get##x().Get().size()); \
68         src_coding = CSeqUtil::e_##x; \
69         CSeqConvert::Convert(src, src_coding, 0, len, decoded,  dst_coding); \
70         break;
71     CODING_CASE(Ncbi2na)
72     CODING_CASE(Iupacna)
73     CODING_CASE(Iupacaa)
74     CODING_CASE(Ncbi4na)
75     CODING_CASE(Ncbi8na)
76     CODING_CASE(Ncbi8aa)
77     CODING_CASE(Ncbieaa)
78     CODING_CASE(Ncbistdaa)
79 #undef CODING_CASE
80     default:
81 //        ERR_POST_X(1, Warning << "PackWithGaps: unsupported encoding "
82 //                      << CSeq_data::SelectionName(data.Which()));
83         return false;
84     }
85     return true;
86 }
87 
88 
MakeGap(const CSeq_data & data,TSeqPos len,CDelta_ext & ext,TSeqPos gap_start,TSeqPos gap_length)89 CRef<CDelta_seq> MakeGap(const CSeq_data& data, TSeqPos len, CDelta_ext& ext,  TSeqPos gap_start, TSeqPos gap_length)
90 {
91     string decoded;
92     if (!Make_Iupacna(data, decoded, len))
93         return CRef<CDelta_seq>();
94 
95     if (gap_start>0)
96     {
97         ext.AddAndSplit(CTempString(decoded, 0, gap_start), CSeq_data::e_Iupacna, gap_start, true, true);
98     }
99 
100     // here we need to check if segment is gap in fact
101     CDelta_seq& gap = ext.AddLiteral(gap_length);
102 
103     if (gap_start+gap_length < decoded.length())
104     {
105         ext.AddAndSplit(
106             CTempString(decoded, gap_start+gap_length, TSeqPos(decoded.length())-gap_start-gap_length),
107             CSeq_data::e_Iupacna, TSeqPos(decoded.length())-gap_start-gap_length, true, true);
108     }
109 
110     return CRef<CDelta_seq>(&gap);
111 }
112 
CloneLiteral(const CDelta_seq & delta,CDelta_ext & ext,TSeqPos len)113 CDelta_seq& CloneLiteral(const CDelta_seq& delta, CDelta_ext& ext, TSeqPos len)
114 {
115     CDelta_seq& new_delta = ext.AddLiteral(len);
116     if (delta.GetLiteral().IsSetSeq_data())
117         new_delta.SetLiteral().SetSeq_data().Assign(delta.GetLiteral().GetSeq_data());
118 
119     return new_delta;
120 }
121 
MakeGap(const CDelta_seq & delta,TSeqPos len,CDelta_ext & ext,TSeqPos gap_start,TSeqPos gap_length)122 CRef<CDelta_seq> MakeGap(const CDelta_seq& delta, TSeqPos len, CDelta_ext& ext,  TSeqPos gap_start, TSeqPos gap_length)
123 {
124     if (delta.IsLiteral() && delta.GetLiteral().IsSetSeq_data())
125        return MakeGap(delta.GetLiteral().GetSeq_data(), len, ext, gap_start, gap_length);
126     else
127     {
128         if (gap_start>0)
129         {
130             CloneLiteral(delta, ext, gap_start);
131         }
132         CDelta_seq& new_gap = ext.AddLiteral(gap_length);
133         if (len>gap_start+gap_length)
134             CloneLiteral(delta, ext, len-gap_start-gap_length);
135         return CRef<CDelta_seq>(&new_gap);
136     }
137 }
138 
139 
MakeGap(CBioseq::TInst & inst,TSeqPos gap_start,TSeqPos gap_length)140 CRef<CDelta_seq> MakeGap(CBioseq::TInst& inst, TSeqPos gap_start, TSeqPos gap_length)
141 {
142     if (inst.IsAa())
143         return CRef<CDelta_seq>();
144 
145     if (inst.IsSetExt()) {
146         CDelta_ext& ext = inst.SetExt().SetDelta();
147 
148         TSeqPos current = 0;
149         // let's find addressed literal
150         NON_CONST_ITERATE(CDelta_ext::Tdata, it, ext.Set())
151         {
152             CRef<CDelta_seq> delta = *it;
153             if (delta->IsLiteral())
154             {
155                 // literal without length specified break the locup
156                 if (!delta->GetLiteral().IsSetLength())
157                     return CRef<CDelta_seq>();
158 
159                 TSeqPos lit_len = delta->GetLiteral().GetLength();
160 
161                 if (gap_start < lit_len)
162                 {
163                     if (gap_start == 0 && lit_len == gap_length) // && !delta->GetLiteral().IsSetSeq_data())
164                         return delta; // simply return the delta, don't create new
165                     else
166                     {
167                         CDelta_ext new_ext; // temporal container
168                         CRef<CDelta_seq> gap_seq = MakeGap(*delta, lit_len, new_ext, gap_start, gap_length);
169                         if (gap_seq)
170                         {
171                             // replace the current delta with created
172                             it = ext.Set().erase(it);
173                             ext.Set().insert(it, new_ext.Set().begin(), new_ext.Set().end());
174                         }
175                         return gap_seq;
176                     }
177                 }
178                 current += lit_len;
179                 if (lit_len > gap_start)
180                     return CRef<CDelta_seq>();
181                 gap_start -= lit_len;
182             }
183             else
184             {
185                 return CRef<CDelta_seq>();
186             }
187         }
188     }
189     else
190     if (inst.IsSetSeq_data())
191     {
192         // convert simple sequence into delta seq
193         const CSeq_data& data = inst.GetSeq_data();
194 
195         CDelta_ext& ext = inst.SetExt().SetDelta();
196 
197         CRef<CDelta_seq> delta = MakeGap(data, inst.GetLength(), ext, gap_start, gap_length);
198 
199         // finalize, if delta seq was not successfull
200         // revert to the original
201         if (ext.Get().size() > 1) {
202             inst.SetRepr(CSeq_inst::eRepr_delta);
203             inst.ResetSeq_data();
204         } else { // roll back
205             inst.ResetExt();
206         }
207         return delta;
208     }
209 
210     return CRef<CDelta_seq>();
211 
212 }
213 
214 }
215 
CGapsEditor(CSeq_gap::EType gap_type,const TEvidenceSet & evidences,TSeqPos gapNmin,TSeqPos gap_Unknown_length)216 CGapsEditor::CGapsEditor(CSeq_gap::EType gap_type, const TEvidenceSet& evidences,
217     TSeqPos gapNmin, TSeqPos gap_Unknown_length) :
218     m_gap_type(gap_type),
219     m_DefaultEvidence(evidences),
220     m_gapNmin(gapNmin),
221     m_gap_Unknown_length(gap_Unknown_length)
222 {
223 }
224 
225 
CGapsEditor(CSeq_gap::EType gap_type,const TEvidenceSet & defaultEvidence,const TCountToEvidenceMap & countToEvidenceMap,TSeqPos gapNmin,TSeqPos gap_Unknown_length)226 CGapsEditor::CGapsEditor(CSeq_gap::EType gap_type,
227         const TEvidenceSet& defaultEvidence,
228         const TCountToEvidenceMap& countToEvidenceMap,
229         TSeqPos gapNmin,
230         TSeqPos gap_Unknown_length)
231     :
232     m_gap_type(gap_type),
233     m_DefaultEvidence(defaultEvidence),
234     m_GapsizeToEvidenceMap(countToEvidenceMap),
235     m_gapNmin(gapNmin),
236     m_gap_Unknown_length(gap_Unknown_length)
237 {
238 }
239 
240 
241 CRef<CDelta_seq>
CreateGap(CBioseq & bioseq,TSeqPos gap_start,TSeqPos gap_length)242 CGapsEditor::CreateGap(CBioseq& bioseq, TSeqPos gap_start, TSeqPos gap_length)
243 {
244     return CreateGap(bioseq, gap_start, gap_length, gap_length==m_gap_Unknown_length);
245 }
246 
247 
248 CRef<CDelta_seq>
CreateGap(CBioseq & bioseq,TSeqPos gap_start,TSeqPos nominal_length,bool length_unknown)249 CGapsEditor::CreateGap(CBioseq& bioseq, TSeqPos gap_start, TSeqPos nominal_length, bool length_unknown)
250 {
251     if (!bioseq.IsSetInst())
252         return CRef<CDelta_seq>();
253 
254     CRef<CDelta_seq> seq = MakeGap(bioseq.SetInst(), gap_start, nominal_length);
255     if (seq.NotEmpty())
256     {
257         seq->SetLiteral();
258         x_SetGapParameters(*seq, length_unknown);
259     }
260     return seq;
261 }
262 
263 
ConvertNs2Gaps(const CSeq_data & data,TSeqPos len,CDelta_ext & ext)264 void  CGapsEditor::ConvertNs2Gaps(const CSeq_data& data, TSeqPos len, CDelta_ext& ext)
265 {
266     string decoded;
267     if (!Make_Iupacna(data, decoded, len))
268         return;
269 
270     {
271         size_t index = 0;
272         CTempString current(decoded);
273         size_t start;
274         while (index + m_gapNmin <= current.length() && ((start = current.find_first_of("Nn", index)) != CTempString::npos))
275         {
276             size_t end = current.find_first_not_of("Nn", start);
277             if (end == CTempString::npos)
278                 end = current.length();
279             if (end - start >= m_gapNmin)
280             {
281                 if (start > 0)
282                     ext.AddAndSplit(current, CSeq_data::e_Iupacna, TSeqPos(start), false, true);
283 
284                 CDelta_seq& gap = ext.AddLiteral(TSeqPos(end-start));
285                 x_SetGapParameters(gap);
286                 current.assign(current.data(), end, current.length() - end);
287                 end = 0;
288             }
289             index = end;
290         }
291         if (current.length() > 0)
292             ext.AddAndSplit(current, CSeq_data::e_Iupacna, TSeqPos(current.length()), false, true);
293     }
294 }
295 
ConvertNs2Gaps(CBioseq::TInst & inst)296 void CGapsEditor::ConvertNs2Gaps(CBioseq::TInst& inst)
297 {
298     if (inst.IsAa()  ||  !inst.IsSetSeq_data()  ||  inst.IsSetExt()) {
299         return;
300     }
301     const CSeq_data& data = inst.GetSeq_data();
302 
303     CDelta_ext& ext = inst.SetExt().SetDelta();
304 
305     ConvertNs2Gaps(data, inst.GetLength(), ext);
306 
307     if (ext.Get().size() > 1) { // finalize
308         inst.SetRepr(CSeq_inst::eRepr_delta);
309         inst.ResetSeq_data();
310     } else { // roll back
311         inst.ResetExt();
312     }
313 }
314 
ConvertNs2Gaps(CBioseq & bioseq)315 void CGapsEditor::ConvertNs2Gaps(CBioseq& bioseq)
316 {
317     if (bioseq.IsSetInst() && bioseq.GetInst().IsSetSeq_data() && !bioseq.GetInst().GetSeq_data().IsGap())
318     {
319         ConvertNs2Gaps(bioseq.SetInst());
320     }
321 
322 
323     if (!bioseq.IsSetInst() || !bioseq.GetInst().IsSetExt() || !bioseq.GetInst().GetExt().IsDelta())
324     {
325         return;
326     }
327 
328     CSeq_inst& inst = bioseq.SetInst();
329 
330     // since delta functions allows adding new literal to an end we create a copy of literal array
331     // to iterate over
332 
333     CDelta_ext::Tdata src_data = inst.GetExt().GetDelta().Get();
334     CDelta_ext& dst_data = inst.SetExt().SetDelta();
335     dst_data.Set().clear();
336 
337     NON_CONST_ITERATE(CDelta_ext::Tdata, it, src_data)
338     {
339         if (!(**it).IsLiteral())
340         {
341             dst_data.Set().push_back(*it);
342             continue;
343         }
344 
345         CDelta_seq::TLiteral& lit = (**it).SetLiteral();
346 
347         // split a literal to elements if needed
348         if (lit.IsSetSeq_data() && !lit.GetSeq_data().IsGap())
349         {
350             ConvertNs2Gaps(lit.GetSeq_data(), lit.GetLength(), dst_data);
351         }
352         else
353         {
354             // otherwise add it as is and possible update some parameters
355             dst_data.Set().push_back(*it);
356             x_SetGapParameters(**it);
357         }
358     }
359 }
360 
x_SetGapParameters(CDelta_seq & lit)361 void CGapsEditor::x_SetGapParameters(CDelta_seq& lit)
362 {
363     bool length_unknown = (lit.GetLiteral().IsSetLength() &&
364                            lit.GetLiteral().GetLength() == m_gap_Unknown_length);
365 
366     x_SetGapParameters(lit, length_unknown);
367 }
368 
x_SetGapParameters(CDelta_seq & lit,bool length_unknown)369 void CGapsEditor::x_SetGapParameters(CDelta_seq& lit, bool length_unknown)
370 {
371     CDelta_seq::TLiteral& gap = lit.SetLiteral();
372     if (length_unknown) {
373         gap.SetFuzz().SetLim(CInt_fuzz::eLim_unk);
374     }
375 
376     if (gap.IsSetSeq_data() &&
377         gap.GetSeq_data().IsGap() &&
378         gap.GetSeq_data().GetGap().GetLinkage_evidence().size() > 0)
379         return;
380 
381     if (!m_DefaultEvidence.empty() || !m_GapsizeToEvidenceMap.empty()) {
382         const auto len = gap.GetLength();
383         const auto it = m_GapsizeToEvidenceMap.find(len);
384         const auto& evidenceSet =
385             (it != m_GapsizeToEvidenceMap.end()) ?
386             it->second :
387             m_DefaultEvidence;
388 
389         if (evidenceSet.empty()) {
390             return;
391         }
392 
393         for (const auto& evidence : evidenceSet) {
394             auto pEvidence = Ref(new CLinkage_evidence());
395             pEvidence->SetType(evidence);
396             gap.SetSeq_data().SetGap().SetLinkage_evidence().emplace_back(move(pEvidence));
397         }
398         gap.SetSeq_data().SetGap().SetLinkage(CSeq_gap::eLinkage_linked);
399         gap.SetSeq_data().SetGap().SetType(m_gap_type);
400     }
401 }
402 
ConvertNs2Gaps(objects::CSeq_entry & entry)403 void CGapsEditor::ConvertNs2Gaps(objects::CSeq_entry& entry)
404 {
405     if (m_gapNmin == 0 && m_gap_Unknown_length > 0)
406         return;
407 
408     switch(entry.Which())
409     {
410     case CSeq_entry::e_Seq:
411         {
412             ConvertNs2Gaps(entry.SetSeq());
413         }
414         break;
415     case CSeq_entry::e_Set:
416         NON_CONST_ITERATE(CSeq_entry::TSet::TSeq_set, it, entry.SetSet().SetSeq_set())
417         {
418             ConvertNs2Gaps(**it);
419         }
420         break;
421     default:
422         break;
423     }
424 }
425 
ConvertBioseqToDelta(CBioseq & bioseq)426 void CGapsEditor::ConvertBioseqToDelta(CBioseq& bioseq)
427 {
428     TSeqPos len = bioseq.GetInst().GetLength();
429     CDelta_ext& delta_ext = bioseq.SetInst().SetExt().SetDelta();
430     CRef<CDelta_seq> delta_seq(new CDelta_seq);
431     delta_seq->SetLiteral().SetSeq_data(bioseq.SetInst().SetSeq_data());
432     delta_seq->SetLiteral().SetLength(len);
433     delta_ext.Set().push_back(delta_seq);
434     bioseq.SetInst().ResetSeq_data();
435     bioseq.SetInst().SetRepr(CSeq_inst::eRepr_delta);
436 }
437 
AppendGap(CBioseq & bioseq)438 void CGapsEditor::AppendGap(CBioseq& bioseq)
439 {
440     CRef<CDelta_seq> delta_seq(new CDelta_seq);
441     CDelta_seq::TLiteral& lit = delta_seq->SetLiteral();
442     lit.SetLength(0);
443     x_SetGapParameters(*delta_seq);
444     const TSeqPos len = 100;
445     lit.SetLength(len);
446     bioseq.SetInst().SetExt().SetDelta().Set().push_back(delta_seq);
447     bioseq.SetInst().SetLength() += len;
448 }
449 
AddBioseqAsLiteral(CBioseq & parent,CBioseq & bioseq)450 void CGapsEditor::AddBioseqAsLiteral(CBioseq& parent, CBioseq& bioseq)
451 {
452     auto& delta_set = parent.SetInst().SetExt().SetDelta().Set();
453     if (!delta_set.empty() && delta_set.back()->GetLiteral().IsSetSeq_data())
454     {
455         AppendGap(parent);
456     }
457     if (bioseq.GetInst().IsSetExt())
458     {
459         delta_set.splice(delta_set.end(), bioseq.SetInst().SetExt().SetDelta().Set());
460     }
461     else {
462         CRef<CDelta_seq> delta_seq(new CDelta_seq);
463         delta_seq->SetLiteral().SetSeq_data(bioseq.SetInst().SetSeq_data());
464         delta_seq->SetLiteral().SetLength(bioseq.GetInst().GetLength());
465         delta_set.push_back(delta_seq);
466     }
467     parent.SetInst().SetLength() += bioseq.GetLength();
468 }
469 
470 END_SCOPE(objects)
471 END_NCBI_SCOPE
472