1 /* $Id: aln_writer.cpp 634268 2021-07-07 15:42:01Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Justin Foley
27 *
28 * File Description: Write alignment
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33
34 #include <objects/seqalign/Seq_align.hpp>
35 #include <objects/seqalign/Dense_seg.hpp>
36 #include <objects/seqalign/Spliced_seg.hpp>
37 #include <objects/seqalign/Spliced_exon.hpp>
38 #include <objects/seqalign/Sparse_seg.hpp>
39 #include <objects/seqalign/Sparse_align.hpp>
40 #include <objects/seqalign/Product_pos.hpp>
41 #include <objects/seqalign/Prot_pos.hpp>
42 #include <objects/seqalign/Spliced_exon_chunk.hpp>
43 #include <objects/general/Object_id.hpp>
44 #include <objects/seq/seq_id_handle.hpp>
45
46 #include <objmgr/scope.hpp>
47 #include <objmgr/bioseq_handle.hpp>
48 #include <objmgr/seq_vector.hpp>
49 #include <objmgr/util/sequence.hpp>
50
51 #include <objtools/writers/writer_exception.hpp>
52 #include <objtools/writers/write_util.hpp>
53 #include <objtools/writers/aln_writer.hpp>
54
55 #include <util/sequtil/sequtil_manip.hpp>
56
57 BEGIN_NCBI_SCOPE
58 USING_SCOPE(objects);
59
60 // ----------------------------------------------------------------------------
CAlnWriter(CScope & scope,CNcbiOstream & ostr,unsigned int uFlags)61 CAlnWriter::CAlnWriter(
62 CScope& scope,
63 CNcbiOstream& ostr,
64 unsigned int uFlags) :
65 CWriterBase(ostr, uFlags)
66 {
67 m_pScope.Reset(&scope);
68 m_Width = 60;
69 CGenbankIdResolve::Get().SetLabelType(CSeq_id::eFasta);
70 };
71
72
73 // ----------------------------------------------------------------------------
74
CAlnWriter(CNcbiOstream & ostr,unsigned int uFlags)75 CAlnWriter::CAlnWriter(
76 CNcbiOstream& ostr,
77 unsigned int uFlags) :
78 CAlnWriter(*(new CScope(*CObjectManager::GetInstance())), ostr, uFlags)
79 {
80 };
81
82
83 // ----------------------------------------------------------------------------
WriteAlign(const CSeq_align & align,const string & name,const string & descr)84 bool CAlnWriter::WriteAlign(
85 const CSeq_align& align,
86 const string& name,
87 const string& descr)
88 {
89
90 switch (align.GetSegs().Which()) {
91 case CSeq_align::C_Segs::e_Denseg:
92 return WriteAlignDenseSeg(align.GetSegs().GetDenseg());
93 case CSeq_align::C_Segs::e_Spliced:
94 return WriteAlignSplicedSeg(align.GetSegs().GetSpliced());
95 case CSeq_align::C_Segs::e_Sparse:
96 return WriteAlignSparseSeg(align.GetSegs().GetSparse());
97 case CSeq_align::C_Segs::e_Std:
98 break;
99 default:
100 break;
101 }
102
103 return false;
104 }
105 // ----------------------------------------------------------------------------
106
ProcessSeqId(const CSeq_id & id,CBioseq_Handle & bsh,CRange<TSeqPos> & range)107 void CAlnWriter::ProcessSeqId(const CSeq_id& id, CBioseq_Handle& bsh, CRange<TSeqPos>& range)
108 {
109 if (m_pScope) {
110
111 bsh = m_pScope->GetBioseqHandle(id);
112 range.SetFrom(CRange<TSeqPos>::GetPositionMin());
113 range.SetToOpen(CRange<TSeqPos>::GetPositionMax());
114 }
115 }
116
117
118 // -----------------------------------------------------------------------------
119
GetSeqString(CBioseq_Handle bsh,const CRange<TSeqPos> & range,ENa_strand strand,string & seq)120 void CAlnWriter::GetSeqString(CBioseq_Handle bsh,
121 const CRange<TSeqPos>& range,
122 ENa_strand strand,
123 string& seq)
124 {
125 if (!bsh) {
126 NCBI_THROW(CObjWriterException,
127 eBadInput,
128 "Empty bioseq handle");
129 }
130
131 CSeqVector seq_vec = bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac, strand);
132 if (range.IsWhole()) {
133 seq_vec.GetSeqData(0, bsh.GetBioseqLength(), seq);
134 }
135 else {
136 seq_vec.GetSeqData(range.GetFrom(), range.GetTo(), seq);
137 }
138
139 if (NStr::IsBlank(seq)) {
140 NCBI_THROW(CObjWriterException,
141 eBadInput,
142 "Empty sequence string");
143 }
144 }
145
146 // -----------------------------------------------------------------------------
147
WriteAlignDenseSeg(const CDense_seg & denseg)148 bool CAlnWriter::WriteAlignDenseSeg(
149 const CDense_seg& denseg)
150 {
151 if (!denseg.CanGetDim() ||
152 !denseg.CanGetNumseg() ||
153 !denseg.CanGetIds() ||
154 !denseg.CanGetStarts() ||
155 !denseg.CanGetLens())
156 {
157 return false;
158 }
159
160 const auto num_rows = denseg.GetDim();
161 const auto num_segs = denseg.GetNumseg();
162
163
164 for (int row=0; row<num_rows; ++row)
165 {
166 const CSeq_id& id = denseg.GetSeq_id(row);
167 CRange<TSeqPos> range;
168
169 CBioseq_Handle bsh;
170 ProcessSeqId(id, bsh, range);
171 if (!bsh) {
172 NCBI_THROW(CObjWriterException,
173 eBadInput,
174 "Unable to fetch Bioseq");
175 }
176
177 string seq_plus;
178 GetSeqString(bsh, range, eNa_strand_plus, seq_plus);
179
180 const CSeqUtil::ECoding coding =
181 (bsh.IsNucleotide()) ?
182 CSeqUtil::e_Iupacna :
183 CSeqUtil::e_Iupacaa;
184
185 string seqdata;
186 for (int seg=0; seg<num_segs; ++seg)
187 {
188 const auto start = denseg.GetStarts()[seg*num_rows + row];
189 const auto len = denseg.GetLens()[seg];
190 const ENa_strand strand = (denseg.IsSetStrands()) ?
191 denseg.GetStrands()[seg*num_rows + row] :
192 eNa_strand_plus;
193 seqdata += GetSegString(seq_plus, coding, strand, start, len);
194 }
195
196 string best_id = GetBestId(id);
197 string defline = ">" + best_id;
198 WriteContiguous(defline, seqdata);
199 }
200
201 return true;
202 }
203
204 // -----------------------------------------------------------------------------
205
WriteAlignSplicedSeg(const CSpliced_seg & spliced_seg)206 bool CAlnWriter::WriteAlignSplicedSeg(
207 const CSpliced_seg& spliced_seg)
208 {
209 if (!spliced_seg.IsSetExons()) {
210 return false;
211 }
212
213 CRef<CSeq_id> genomic_id;
214 if (spliced_seg.IsSetGenomic_id()) {
215 genomic_id = Ref(new CSeq_id());
216 genomic_id->Assign(spliced_seg.GetGenomic_id());
217 }
218
219
220 CRef<CSeq_id> product_id;
221 if (spliced_seg.IsSetGenomic_id()) {
222 product_id = Ref(new CSeq_id());
223 product_id->Assign(spliced_seg.GetProduct_id());
224 }
225
226
227 ENa_strand genomic_strand =
228 spliced_seg.IsSetGenomic_strand() ?
229 spliced_seg.GetGenomic_strand() :
230 eNa_strand_plus;
231
232
233 ENa_strand product_strand =
234 spliced_seg.IsSetProduct_strand() ?
235 spliced_seg.GetProduct_strand() :
236 eNa_strand_plus;
237
238 return WriteSplicedExons(spliced_seg.GetExons(),
239 spliced_seg.GetProduct_type(),
240 genomic_id,
241 genomic_strand,
242 product_id,
243 product_strand);
244 }
245
s_ProductLength(const CProduct_pos & start,const CProduct_pos & end)246 unsigned int s_ProductLength(const CProduct_pos& start, const CProduct_pos& end)
247 {
248 if (start.Which() != end.Which()) {
249 NCBI_THROW(CObjWriterException,
250 eBadInput,
251 "Unable to determine product length");
252 }
253
254 if (start.Which() == CProduct_pos::e_not_set) {
255 NCBI_THROW(CObjWriterException,
256 eBadInput,
257 "Unable to determine product length");
258 }
259
260 const int length = end.AsSeqPos() - start.AsSeqPos();
261
262 return (length >= 0) ? length : -length;
263 }
264
265
266 // -----------------------------------------------------------------------------
WriteSplicedExons(const list<CRef<CSpliced_exon>> & exons,CSpliced_seg::TProduct_type product_type,CRef<CSeq_id> default_genomic_id,ENa_strand default_genomic_strand,CRef<CSeq_id> default_product_id,ENa_strand default_product_strand)267 bool CAlnWriter::WriteSplicedExons(const list<CRef<CSpliced_exon>>& exons,
268 CSpliced_seg::TProduct_type product_type,
269 CRef<CSeq_id> default_genomic_id, // May be NULL
270 ENa_strand default_genomic_strand,
271 CRef<CSeq_id> default_product_id,
272 ENa_strand default_product_strand)
273 {
274 string prev_genomic_id;
275 string prev_product_id;
276 for (const CRef<CSpliced_exon>& exon : exons) {
277
278 const CSeq_id& genomic_id =
279 exon->IsSetGenomic_id() ?
280 exon->GetGenomic_id() :
281 *default_genomic_id;
282
283 const CSeq_id& product_id =
284 exon->IsSetProduct_id() ?
285 exon->GetProduct_id() :
286 *default_product_id;
287
288 // Should check to see that the ids are not empty
289 const ENa_strand genomic_strand =
290 exon->IsSetGenomic_strand() ?
291 exon->GetGenomic_strand() :
292 default_genomic_strand;
293
294 const ENa_strand product_strand =
295 exon->IsSetProduct_strand() ?
296 exon->GetProduct_strand() :
297 default_product_strand;
298
299 const auto genomic_start = exon->GetGenomic_start();
300 const auto genomic_end = exon->GetGenomic_end();
301
302 if (genomic_end < genomic_start) {
303 NCBI_THROW(CObjWriterException,
304 eBadInput,
305 "Bad genomic location: end < start");
306 }
307 const int genomic_length = genomic_end - genomic_start;
308
309 const int product_start = exon->GetProduct_start().AsSeqPos();
310 const int product_end = exon->GetProduct_end().AsSeqPos();
311
312
313 if (product_end < product_start) {
314 NCBI_THROW(CObjWriterException,
315 eBadInput,
316 "Bad product location: end < start");
317 }
318 // product_length is now given in nucleotide units
319 const int product_length = product_end - product_start;
320
321 CBioseq_Handle genomic_bsh;
322 if (m_pScope) {
323 genomic_bsh = m_pScope->GetBioseqHandle(genomic_id);
324 }
325 if (!genomic_bsh) {
326 NCBI_THROW(CObjWriterException,
327 eBadInput,
328 "Unable to resolve genomic sequence");
329 }
330 CRange<TSeqPos> genomic_range(genomic_start, genomic_end+1);
331 string genomic_seq;
332 GetSeqString(genomic_bsh, genomic_range, genomic_strand, genomic_seq);
333
334 CBioseq_Handle product_bsh;
335 if (m_pScope) {
336 product_bsh = m_pScope->GetBioseqHandle(product_id);
337 }
338 if (!product_bsh) {
339 NCBI_THROW(CObjWriterException,
340 eBadInput,
341 "Unable to resolve product sequence");
342 }
343 CRange<TSeqPos> product_range(product_start, product_end+1);
344 string product_seq;
345 GetSeqString(product_bsh, product_range, product_strand, product_seq);
346
347 if (exon->IsSetParts()) {
348 AddGaps(product_type, exon->GetParts(), genomic_seq, product_seq);
349 }
350 else
351 if (product_length != genomic_length) {
352 NCBI_THROW(CObjWriterException,
353 eBadInput,
354 "Lengths of genomic and product sequences don't match");
355 }
356
357 WriteContiguous(">" + GetBestId(genomic_id), genomic_seq);
358 WriteContiguous(">" + GetBestId(product_id), product_seq);
359 }
360
361 return true;
362 }
363
364 // -----------------------------------------------------------------------------
365
AddGaps(CSpliced_seg::TProduct_type product_type,const CSpliced_exon::TParts & exon_chunks,string & genomic_seq,string & product_seq)366 void CAlnWriter::AddGaps(
367 CSpliced_seg::TProduct_type product_type,
368 const CSpliced_exon::TParts& exon_chunks,
369 string& genomic_seq,
370 string& product_seq)
371 {
372
373 if (exon_chunks.empty()) {
374 return;
375 }
376
377 string genomic_string;
378 string product_string;
379
380 const unsigned int res_width =
381 (product_type == CSpliced_seg::eProduct_type_transcript) ?
382 1 : 3;
383
384
385 // Check that match + mismatch + diag + genomic_ins = genomic_length
386 int genomic_pos = 0;
387 int product_pos = 0;
388 unsigned int interval_width = 0;
389 for (CRef<CSpliced_exon_chunk> exon_chunk : exon_chunks) {
390 auto chunk_type = exon_chunk->Which();
391 switch(chunk_type) {
392 case CSpliced_exon_chunk::e_Match:
393 case CSpliced_exon_chunk::e_Mismatch:
394 case CSpliced_exon_chunk::e_Diag:
395 switch(chunk_type) {
396 default:
397 // compiler, shut up!
398 break;
399 case CSpliced_exon_chunk::e_Match:
400 interval_width = exon_chunk->GetMatch();
401 break;
402 case CSpliced_exon_chunk::e_Mismatch:
403 interval_width = exon_chunk->GetMismatch();
404 break;
405 case CSpliced_exon_chunk::e_Diag:
406 interval_width = exon_chunk->GetDiag();
407 break;
408 }
409 genomic_string.append(genomic_seq, genomic_pos, interval_width);
410 product_string.append(product_seq, product_pos, (interval_width + (res_width-1))/res_width);
411 genomic_pos += interval_width;
412 product_pos += interval_width/res_width;
413 break;
414
415 case CSpliced_exon_chunk::e_Genomic_ins:
416 interval_width = exon_chunk->GetGenomic_ins();
417 genomic_string.append(genomic_seq, genomic_pos, interval_width);
418 product_string.append(interval_width/res_width, '-');
419 genomic_pos += interval_width;
420 break;
421
422 case CSpliced_exon_chunk::e_Product_ins:
423 interval_width = exon_chunk->GetProduct_ins();
424 genomic_string.append(interval_width, '-');
425 product_string.append(product_seq, product_pos, interval_width/res_width);
426 product_pos += interval_width/res_width;
427 break;
428 default:
429 break;
430 }
431 }
432 genomic_seq = genomic_string;
433 product_seq = product_string;
434 }
435
436
437 // -----------------------------------------------------------------------------
438
GetSegString(const string & seq_plus,CSeqUtil::ECoding coding,const ENa_strand strand,const int start,const size_t len)439 string CAlnWriter::GetSegString(const string& seq_plus,
440 CSeqUtil::ECoding coding,
441 const ENa_strand strand,
442 const int start,
443 const size_t len)
444 {
445 if (start >= 0) {
446 if (start >= seq_plus.size()) {
447 NCBI_THROW(CObjWriterException,
448 eBadInput,
449 "Bad location: impossible start");
450 }
451 if (strand != eNa_strand_minus) {
452 return seq_plus.substr(start, len);
453 }
454 // else
455 string seq_minus;
456 CSeqManip::ReverseComplement(seq_plus, coding, start, len, seq_minus);
457 return seq_minus;
458 }
459
460 return string(len, '-');
461 }
462
463
464 // -----------------------------------------------------------------------------
WriteAlignSparseSeg(const CSparse_seg & sparse_seg)465 bool CAlnWriter::WriteAlignSparseSeg(
466 const CSparse_seg& sparse_seg)
467 {
468 for (CRef<CSparse_align> align : sparse_seg.GetRows())
469 {
470 if (!WriteSparseAlign(*align)) {
471 return false;
472 }
473 }
474 return true;
475 }
476
477
478
479 // -----------------------------------------------------------------------------
480
WriteSparseAlign(const CSparse_align & sparse_align)481 bool CAlnWriter::WriteSparseAlign(const CSparse_align& sparse_align)
482 {
483 const auto num_segs = sparse_align.GetNumseg();
484
485 {
486 const CSeq_id& first_id = sparse_align.GetFirst_id();
487 CBioseq_Handle bsh;
488 CRange<TSeqPos> range;
489 ProcessSeqId(first_id, bsh, range);
490 if (!bsh) {
491 NCBI_THROW(CObjWriterException,
492 eBadInput,
493 "Unable to resolve ID " + first_id.AsFastaString());
494 }
495 const CSeqUtil::ECoding coding =
496 (bsh.IsNucleotide())?
497 CSeqUtil::e_Iupacna :
498 CSeqUtil::e_Iupacaa;
499
500 string seq_plus;
501 GetSeqString(bsh, range, eNa_strand_plus, seq_plus);
502
503 string seqdata;
504 for (int seg=0; seg<num_segs; ++seg) {
505 const auto start = sparse_align.GetFirst_starts()[seg];
506 const auto len = sparse_align.GetLens()[seg];
507 seqdata += GetSegString(seq_plus, coding, eNa_strand_plus, start, len);
508 }
509 WriteContiguous(">" + GetBestId(first_id), seqdata);
510 }
511
512 { // Second row
513 const CSeq_id& second_id = sparse_align.GetSecond_id();
514 CBioseq_Handle bsh;
515 CRange<TSeqPos> range;
516 ProcessSeqId(second_id, bsh, range);
517 if (!bsh) {
518 NCBI_THROW(CObjWriterException,
519 eBadInput,
520 "Unable to resolve ID " + second_id.AsFastaString());
521 }
522 const CSeqUtil::ECoding coding =
523 (bsh.IsNucleotide())?
524 CSeqUtil::e_Iupacna :
525 CSeqUtil::e_Iupacaa;
526
527 string seq_plus;
528 GetSeqString(bsh, range, eNa_strand_plus, seq_plus);
529
530 string seqdata;
531 const vector<ENa_strand>& strands = sparse_align.IsSetSecond_strands() ?
532 sparse_align.GetSecond_strands() : vector<ENa_strand>(num_segs, eNa_strand_plus);
533 for (int seg=0; seg<num_segs; ++seg) {
534 const auto start = sparse_align.GetFirst_starts()[seg];
535 const auto len = sparse_align.GetLens()[seg];
536 seqdata += GetSegString(seq_plus, coding, strands[seg], start, len);
537 }
538
539 WriteContiguous(">" + GetBestId(second_id), seqdata);
540 }
541 return true;
542 }
543
544 // -----------------------------------------------------------------------------
545
WriteContiguous(const string & defline,const string & seqdata)546 void CAlnWriter::WriteContiguous(const string& defline, const string& seqdata)
547 {
548 if (defline.back() == '|' && defline.size() > 1)
549 {
550 const auto length = defline.size();
551 m_Os << defline.substr(0,length-1) << "\n";
552 }
553 else {
554 m_Os << defline << "\n";
555 }
556 size_t pos=0;
557 while (pos < seqdata.size()) {
558
559 if (IsCanceled()) {
560 NCBI_THROW(
561 CObjWriterException,
562 eInterrupted,
563 "Processing terminated by user");
564 }
565
566 m_Os << seqdata.substr(pos, m_Width) << "\n";
567 pos += m_Width;
568 }
569 }
570
571 // -----------------------------------------------------------------------------
572
GetBestId(const CSeq_id & id)573 string CAlnWriter::GetBestId(const CSeq_id& id)
574 {
575 string best_id;
576 CGenbankIdResolve::Get().GetBestId(
577 CSeq_id_Handle::GetHandle(id),
578 *m_pScope,
579 best_id);
580 return best_id;
581 }
582
583 // -----------------------------------------------------------------------------
584
585 END_NCBI_SCOPE
586
587