1 /* $Id: gaps_edit.cpp 632623 2021-06-03 17:38:11Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Sergiy Gotvyanskyy, NCBI
27 *
28 * File Description:
29 * Converts runs of 'N' or 'n' into a gap
30 *
31 * ===========================================================================
32 */
33
34 #include <ncbi_pch.hpp>
35
36 #include <corelib/ncbiutil.hpp>
37
38 #include <objects/seq/Bioseq.hpp>
39 #include <objects/seqset/Bioseq_set.hpp>
40 #include <objects/seqset/Seq_entry.hpp>
41 #include <objects/seq/Delta_ext.hpp>
42 #include <objects/seq/Delta_seq.hpp>
43 #include <objects/seq/Seq_gap.hpp>
44 #include <objects/seq/Seq_literal.hpp>
45 #include <objects/general/Int_fuzz.hpp>
46 #include <objects/seq/Seq_inst.hpp>
47 #include <objects/seq/Seq_ext.hpp>
48
49 #include <util/sequtil/sequtil_convert.hpp>
50
51 #include <objtools/edit/gaps_edit.hpp>
52
53 BEGIN_NCBI_SCOPE
54 BEGIN_SCOPE(objects)
55
56 namespace
57 {
58
Make_Iupacna(const CSeq_data & data,string & decoded,TSeqPos len)59 bool Make_Iupacna(const CSeq_data& data, string& decoded, TSeqPos len)
60 {
61 CSeqUtil::TCoding src_coding = CSeqUtil::e_not_set;
62 CSeqUtil::TCoding dst_coding = CSeqUtil::e_Iupacna;
63 CTempString src;
64 switch (data.Which()) {
65 #define CODING_CASE(x) \
66 case CSeq_data::e_##x: \
67 src.assign(&data.Get##x().Get()[0], data.Get##x().Get().size()); \
68 src_coding = CSeqUtil::e_##x; \
69 CSeqConvert::Convert(src, src_coding, 0, len, decoded, dst_coding); \
70 break;
71 CODING_CASE(Ncbi2na)
72 CODING_CASE(Iupacna)
73 CODING_CASE(Iupacaa)
74 CODING_CASE(Ncbi4na)
75 CODING_CASE(Ncbi8na)
76 CODING_CASE(Ncbi8aa)
77 CODING_CASE(Ncbieaa)
78 CODING_CASE(Ncbistdaa)
79 #undef CODING_CASE
80 default:
81 // ERR_POST_X(1, Warning << "PackWithGaps: unsupported encoding "
82 // << CSeq_data::SelectionName(data.Which()));
83 return false;
84 }
85 return true;
86 }
87
88
MakeGap(const CSeq_data & data,TSeqPos len,CDelta_ext & ext,TSeqPos gap_start,TSeqPos gap_length)89 CRef<CDelta_seq> MakeGap(const CSeq_data& data, TSeqPos len, CDelta_ext& ext, TSeqPos gap_start, TSeqPos gap_length)
90 {
91 string decoded;
92 if (!Make_Iupacna(data, decoded, len))
93 return CRef<CDelta_seq>();
94
95 if (gap_start>0)
96 {
97 ext.AddAndSplit(CTempString(decoded, 0, gap_start), CSeq_data::e_Iupacna, gap_start, true, true);
98 }
99
100 // here we need to check if segment is gap in fact
101 CDelta_seq& gap = ext.AddLiteral(gap_length);
102
103 if (gap_start+gap_length < decoded.length())
104 {
105 ext.AddAndSplit(
106 CTempString(decoded, gap_start+gap_length, TSeqPos(decoded.length())-gap_start-gap_length),
107 CSeq_data::e_Iupacna, TSeqPos(decoded.length())-gap_start-gap_length, true, true);
108 }
109
110 return CRef<CDelta_seq>(&gap);
111 }
112
CloneLiteral(const CDelta_seq & delta,CDelta_ext & ext,TSeqPos len)113 CDelta_seq& CloneLiteral(const CDelta_seq& delta, CDelta_ext& ext, TSeqPos len)
114 {
115 CDelta_seq& new_delta = ext.AddLiteral(len);
116 if (delta.GetLiteral().IsSetSeq_data())
117 new_delta.SetLiteral().SetSeq_data().Assign(delta.GetLiteral().GetSeq_data());
118
119 return new_delta;
120 }
121
MakeGap(const CDelta_seq & delta,TSeqPos len,CDelta_ext & ext,TSeqPos gap_start,TSeqPos gap_length)122 CRef<CDelta_seq> MakeGap(const CDelta_seq& delta, TSeqPos len, CDelta_ext& ext, TSeqPos gap_start, TSeqPos gap_length)
123 {
124 if (delta.IsLiteral() && delta.GetLiteral().IsSetSeq_data())
125 return MakeGap(delta.GetLiteral().GetSeq_data(), len, ext, gap_start, gap_length);
126 else
127 {
128 if (gap_start>0)
129 {
130 CloneLiteral(delta, ext, gap_start);
131 }
132 CDelta_seq& new_gap = ext.AddLiteral(gap_length);
133 if (len>gap_start+gap_length)
134 CloneLiteral(delta, ext, len-gap_start-gap_length);
135 return CRef<CDelta_seq>(&new_gap);
136 }
137 }
138
139
MakeGap(CBioseq::TInst & inst,TSeqPos gap_start,TSeqPos gap_length)140 CRef<CDelta_seq> MakeGap(CBioseq::TInst& inst, TSeqPos gap_start, TSeqPos gap_length)
141 {
142 if (inst.IsAa())
143 return CRef<CDelta_seq>();
144
145 if (inst.IsSetExt()) {
146 CDelta_ext& ext = inst.SetExt().SetDelta();
147
148 TSeqPos current = 0;
149 // let's find addressed literal
150 NON_CONST_ITERATE(CDelta_ext::Tdata, it, ext.Set())
151 {
152 CRef<CDelta_seq> delta = *it;
153 if (delta->IsLiteral())
154 {
155 // literal without length specified break the locup
156 if (!delta->GetLiteral().IsSetLength())
157 return CRef<CDelta_seq>();
158
159 TSeqPos lit_len = delta->GetLiteral().GetLength();
160
161 if (gap_start < lit_len)
162 {
163 if (gap_start == 0 && lit_len == gap_length) // && !delta->GetLiteral().IsSetSeq_data())
164 return delta; // simply return the delta, don't create new
165 else
166 {
167 CDelta_ext new_ext; // temporal container
168 CRef<CDelta_seq> gap_seq = MakeGap(*delta, lit_len, new_ext, gap_start, gap_length);
169 if (gap_seq)
170 {
171 // replace the current delta with created
172 it = ext.Set().erase(it);
173 ext.Set().insert(it, new_ext.Set().begin(), new_ext.Set().end());
174 }
175 return gap_seq;
176 }
177 }
178 current += lit_len;
179 if (lit_len > gap_start)
180 return CRef<CDelta_seq>();
181 gap_start -= lit_len;
182 }
183 else
184 {
185 return CRef<CDelta_seq>();
186 }
187 }
188 }
189 else
190 if (inst.IsSetSeq_data())
191 {
192 // convert simple sequence into delta seq
193 const CSeq_data& data = inst.GetSeq_data();
194
195 CDelta_ext& ext = inst.SetExt().SetDelta();
196
197 CRef<CDelta_seq> delta = MakeGap(data, inst.GetLength(), ext, gap_start, gap_length);
198
199 // finalize, if delta seq was not successfull
200 // revert to the original
201 if (ext.Get().size() > 1) {
202 inst.SetRepr(CSeq_inst::eRepr_delta);
203 inst.ResetSeq_data();
204 } else { // roll back
205 inst.ResetExt();
206 }
207 return delta;
208 }
209
210 return CRef<CDelta_seq>();
211
212 }
213
214 }
215
CGapsEditor(CSeq_gap::EType gap_type,const TEvidenceSet & evidences,TSeqPos gapNmin,TSeqPos gap_Unknown_length)216 CGapsEditor::CGapsEditor(CSeq_gap::EType gap_type, const TEvidenceSet& evidences,
217 TSeqPos gapNmin, TSeqPos gap_Unknown_length) :
218 m_gap_type(gap_type),
219 m_DefaultEvidence(evidences),
220 m_gapNmin(gapNmin),
221 m_gap_Unknown_length(gap_Unknown_length)
222 {
223 }
224
225
CGapsEditor(CSeq_gap::EType gap_type,const TEvidenceSet & defaultEvidence,const TCountToEvidenceMap & countToEvidenceMap,TSeqPos gapNmin,TSeqPos gap_Unknown_length)226 CGapsEditor::CGapsEditor(CSeq_gap::EType gap_type,
227 const TEvidenceSet& defaultEvidence,
228 const TCountToEvidenceMap& countToEvidenceMap,
229 TSeqPos gapNmin,
230 TSeqPos gap_Unknown_length)
231 :
232 m_gap_type(gap_type),
233 m_DefaultEvidence(defaultEvidence),
234 m_GapsizeToEvidenceMap(countToEvidenceMap),
235 m_gapNmin(gapNmin),
236 m_gap_Unknown_length(gap_Unknown_length)
237 {
238 }
239
240
241 CRef<CDelta_seq>
CreateGap(CBioseq & bioseq,TSeqPos gap_start,TSeqPos gap_length)242 CGapsEditor::CreateGap(CBioseq& bioseq, TSeqPos gap_start, TSeqPos gap_length)
243 {
244 return CreateGap(bioseq, gap_start, gap_length, gap_length==m_gap_Unknown_length);
245 }
246
247
248 CRef<CDelta_seq>
CreateGap(CBioseq & bioseq,TSeqPos gap_start,TSeqPos nominal_length,bool length_unknown)249 CGapsEditor::CreateGap(CBioseq& bioseq, TSeqPos gap_start, TSeqPos nominal_length, bool length_unknown)
250 {
251 if (!bioseq.IsSetInst())
252 return CRef<CDelta_seq>();
253
254 CRef<CDelta_seq> seq = MakeGap(bioseq.SetInst(), gap_start, nominal_length);
255 if (seq.NotEmpty())
256 {
257 seq->SetLiteral();
258 x_SetGapParameters(*seq, length_unknown);
259 }
260 return seq;
261 }
262
263
ConvertNs2Gaps(const CSeq_data & data,TSeqPos len,CDelta_ext & ext)264 void CGapsEditor::ConvertNs2Gaps(const CSeq_data& data, TSeqPos len, CDelta_ext& ext)
265 {
266 string decoded;
267 if (!Make_Iupacna(data, decoded, len))
268 return;
269
270 {
271 size_t index = 0;
272 CTempString current(decoded);
273 size_t start;
274 while (index + m_gapNmin <= current.length() && ((start = current.find_first_of("Nn", index)) != CTempString::npos))
275 {
276 size_t end = current.find_first_not_of("Nn", start);
277 if (end == CTempString::npos)
278 end = current.length();
279 if (end - start >= m_gapNmin)
280 {
281 if (start > 0)
282 ext.AddAndSplit(current, CSeq_data::e_Iupacna, TSeqPos(start), false, true);
283
284 CDelta_seq& gap = ext.AddLiteral(TSeqPos(end-start));
285 x_SetGapParameters(gap);
286 current.assign(current.data(), end, current.length() - end);
287 end = 0;
288 }
289 index = end;
290 }
291 if (current.length() > 0)
292 ext.AddAndSplit(current, CSeq_data::e_Iupacna, TSeqPos(current.length()), false, true);
293 }
294 }
295
ConvertNs2Gaps(CBioseq::TInst & inst)296 void CGapsEditor::ConvertNs2Gaps(CBioseq::TInst& inst)
297 {
298 if (inst.IsAa() || !inst.IsSetSeq_data() || inst.IsSetExt()) {
299 return;
300 }
301 const CSeq_data& data = inst.GetSeq_data();
302
303 CDelta_ext& ext = inst.SetExt().SetDelta();
304
305 ConvertNs2Gaps(data, inst.GetLength(), ext);
306
307 if (ext.Get().size() > 1) { // finalize
308 inst.SetRepr(CSeq_inst::eRepr_delta);
309 inst.ResetSeq_data();
310 } else { // roll back
311 inst.ResetExt();
312 }
313 }
314
ConvertNs2Gaps(CBioseq & bioseq)315 void CGapsEditor::ConvertNs2Gaps(CBioseq& bioseq)
316 {
317 if (bioseq.IsSetInst() && bioseq.GetInst().IsSetSeq_data() && !bioseq.GetInst().GetSeq_data().IsGap())
318 {
319 ConvertNs2Gaps(bioseq.SetInst());
320 }
321
322
323 if (!bioseq.IsSetInst() || !bioseq.GetInst().IsSetExt() || !bioseq.GetInst().GetExt().IsDelta())
324 {
325 return;
326 }
327
328 CSeq_inst& inst = bioseq.SetInst();
329
330 // since delta functions allows adding new literal to an end we create a copy of literal array
331 // to iterate over
332
333 CDelta_ext::Tdata src_data = inst.GetExt().GetDelta().Get();
334 CDelta_ext& dst_data = inst.SetExt().SetDelta();
335 dst_data.Set().clear();
336
337 NON_CONST_ITERATE(CDelta_ext::Tdata, it, src_data)
338 {
339 if (!(**it).IsLiteral())
340 {
341 dst_data.Set().push_back(*it);
342 continue;
343 }
344
345 CDelta_seq::TLiteral& lit = (**it).SetLiteral();
346
347 // split a literal to elements if needed
348 if (lit.IsSetSeq_data() && !lit.GetSeq_data().IsGap())
349 {
350 ConvertNs2Gaps(lit.GetSeq_data(), lit.GetLength(), dst_data);
351 }
352 else
353 {
354 // otherwise add it as is and possible update some parameters
355 dst_data.Set().push_back(*it);
356 x_SetGapParameters(**it);
357 }
358 }
359 }
360
x_SetGapParameters(CDelta_seq & lit)361 void CGapsEditor::x_SetGapParameters(CDelta_seq& lit)
362 {
363 bool length_unknown = (lit.GetLiteral().IsSetLength() &&
364 lit.GetLiteral().GetLength() == m_gap_Unknown_length);
365
366 x_SetGapParameters(lit, length_unknown);
367 }
368
x_SetGapParameters(CDelta_seq & lit,bool length_unknown)369 void CGapsEditor::x_SetGapParameters(CDelta_seq& lit, bool length_unknown)
370 {
371 CDelta_seq::TLiteral& gap = lit.SetLiteral();
372 if (length_unknown) {
373 gap.SetFuzz().SetLim(CInt_fuzz::eLim_unk);
374 }
375
376 if (gap.IsSetSeq_data() &&
377 gap.GetSeq_data().IsGap() &&
378 gap.GetSeq_data().GetGap().GetLinkage_evidence().size() > 0)
379 return;
380
381 if (!m_DefaultEvidence.empty() || !m_GapsizeToEvidenceMap.empty()) {
382 const auto len = gap.GetLength();
383 const auto it = m_GapsizeToEvidenceMap.find(len);
384 const auto& evidenceSet =
385 (it != m_GapsizeToEvidenceMap.end()) ?
386 it->second :
387 m_DefaultEvidence;
388
389 if (evidenceSet.empty()) {
390 return;
391 }
392
393 for (const auto& evidence : evidenceSet) {
394 auto pEvidence = Ref(new CLinkage_evidence());
395 pEvidence->SetType(evidence);
396 gap.SetSeq_data().SetGap().SetLinkage_evidence().emplace_back(move(pEvidence));
397 }
398 gap.SetSeq_data().SetGap().SetLinkage(CSeq_gap::eLinkage_linked);
399 gap.SetSeq_data().SetGap().SetType(m_gap_type);
400 }
401 }
402
ConvertNs2Gaps(objects::CSeq_entry & entry)403 void CGapsEditor::ConvertNs2Gaps(objects::CSeq_entry& entry)
404 {
405 if (m_gapNmin == 0 && m_gap_Unknown_length > 0)
406 return;
407
408 switch(entry.Which())
409 {
410 case CSeq_entry::e_Seq:
411 {
412 ConvertNs2Gaps(entry.SetSeq());
413 }
414 break;
415 case CSeq_entry::e_Set:
416 NON_CONST_ITERATE(CSeq_entry::TSet::TSeq_set, it, entry.SetSet().SetSeq_set())
417 {
418 ConvertNs2Gaps(**it);
419 }
420 break;
421 default:
422 break;
423 }
424 }
425
ConvertBioseqToDelta(CBioseq & bioseq)426 void CGapsEditor::ConvertBioseqToDelta(CBioseq& bioseq)
427 {
428 TSeqPos len = bioseq.GetInst().GetLength();
429 CDelta_ext& delta_ext = bioseq.SetInst().SetExt().SetDelta();
430 CRef<CDelta_seq> delta_seq(new CDelta_seq);
431 delta_seq->SetLiteral().SetSeq_data(bioseq.SetInst().SetSeq_data());
432 delta_seq->SetLiteral().SetLength(len);
433 delta_ext.Set().push_back(delta_seq);
434 bioseq.SetInst().ResetSeq_data();
435 bioseq.SetInst().SetRepr(CSeq_inst::eRepr_delta);
436 }
437
AppendGap(CBioseq & bioseq)438 void CGapsEditor::AppendGap(CBioseq& bioseq)
439 {
440 CRef<CDelta_seq> delta_seq(new CDelta_seq);
441 CDelta_seq::TLiteral& lit = delta_seq->SetLiteral();
442 lit.SetLength(0);
443 x_SetGapParameters(*delta_seq);
444 const TSeqPos len = 100;
445 lit.SetLength(len);
446 bioseq.SetInst().SetExt().SetDelta().Set().push_back(delta_seq);
447 bioseq.SetInst().SetLength() += len;
448 }
449
AddBioseqAsLiteral(CBioseq & parent,CBioseq & bioseq)450 void CGapsEditor::AddBioseqAsLiteral(CBioseq& parent, CBioseq& bioseq)
451 {
452 auto& delta_set = parent.SetInst().SetExt().SetDelta().Set();
453 if (!delta_set.empty() && delta_set.back()->GetLiteral().IsSetSeq_data())
454 {
455 AppendGap(parent);
456 }
457 if (bioseq.GetInst().IsSetExt())
458 {
459 delta_set.splice(delta_set.end(), bioseq.SetInst().SetExt().SetDelta().Set());
460 }
461 else {
462 CRef<CDelta_seq> delta_seq(new CDelta_seq);
463 delta_seq->SetLiteral().SetSeq_data(bioseq.SetInst().SetSeq_data());
464 delta_seq->SetLiteral().SetLength(bioseq.GetInst().GetLength());
465 delta_set.push_back(delta_seq);
466 }
467 parent.SetInst().SetLength() += bioseq.GetLength();
468 }
469
470 END_SCOPE(objects)
471 END_NCBI_SCOPE
472