1 /*  $Id: aln_writer.cpp 634268 2021-07-07 15:42:01Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Justin Foley
27  *
28  * File Description:  Write alignment
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 
34 #include <objects/seqalign/Seq_align.hpp>
35 #include <objects/seqalign/Dense_seg.hpp>
36 #include <objects/seqalign/Spliced_seg.hpp>
37 #include <objects/seqalign/Spliced_exon.hpp>
38 #include <objects/seqalign/Sparse_seg.hpp>
39 #include <objects/seqalign/Sparse_align.hpp>
40 #include <objects/seqalign/Product_pos.hpp>
41 #include <objects/seqalign/Prot_pos.hpp>
42 #include <objects/seqalign/Spliced_exon_chunk.hpp>
43 #include <objects/general/Object_id.hpp>
44 #include <objects/seq/seq_id_handle.hpp>
45 
46 #include <objmgr/scope.hpp>
47 #include <objmgr/bioseq_handle.hpp>
48 #include <objmgr/seq_vector.hpp>
49 #include <objmgr/util/sequence.hpp>
50 
51 #include <objtools/writers/writer_exception.hpp>
52 #include <objtools/writers/write_util.hpp>
53 #include <objtools/writers/aln_writer.hpp>
54 
55 #include <util/sequtil/sequtil_manip.hpp>
56 
57 BEGIN_NCBI_SCOPE
58 USING_SCOPE(objects);
59 
60 //  ----------------------------------------------------------------------------
CAlnWriter(CScope & scope,CNcbiOstream & ostr,unsigned int uFlags)61 CAlnWriter::CAlnWriter(
62     CScope& scope,
63     CNcbiOstream& ostr,
64     unsigned int uFlags) :
65     CWriterBase(ostr, uFlags)
66 {
67     m_pScope.Reset(&scope);
68     m_Width = 60;
69     CGenbankIdResolve::Get().SetLabelType(CSeq_id::eFasta);
70 };
71 
72 
73 //  ----------------------------------------------------------------------------
74 
CAlnWriter(CNcbiOstream & ostr,unsigned int uFlags)75 CAlnWriter::CAlnWriter(
76     CNcbiOstream& ostr,
77     unsigned int uFlags) :
78     CAlnWriter(*(new CScope(*CObjectManager::GetInstance())), ostr, uFlags)
79 {
80 };
81 
82 
83 //  ----------------------------------------------------------------------------
WriteAlign(const CSeq_align & align,const string & name,const string & descr)84 bool CAlnWriter::WriteAlign(
85     const CSeq_align& align,
86     const string& name,
87     const string& descr)
88 {
89 
90     switch (align.GetSegs().Which()) {
91     case CSeq_align::C_Segs::e_Denseg:
92         return WriteAlignDenseSeg(align.GetSegs().GetDenseg());
93     case CSeq_align::C_Segs::e_Spliced:
94         return WriteAlignSplicedSeg(align.GetSegs().GetSpliced());
95     case CSeq_align::C_Segs::e_Sparse:
96         return WriteAlignSparseSeg(align.GetSegs().GetSparse());
97     case CSeq_align::C_Segs::e_Std:
98         break;
99     default:
100         break;
101     }
102 
103     return false;
104 }
105 //  ----------------------------------------------------------------------------
106 
ProcessSeqId(const CSeq_id & id,CBioseq_Handle & bsh,CRange<TSeqPos> & range)107 void CAlnWriter::ProcessSeqId(const CSeq_id& id, CBioseq_Handle& bsh, CRange<TSeqPos>& range)
108 {
109     if (m_pScope) {
110 
111         bsh = m_pScope->GetBioseqHandle(id);
112         range.SetFrom(CRange<TSeqPos>::GetPositionMin());
113         range.SetToOpen(CRange<TSeqPos>::GetPositionMax());
114     }
115 }
116 
117 
118 // -----------------------------------------------------------------------------
119 
GetSeqString(CBioseq_Handle bsh,const CRange<TSeqPos> & range,ENa_strand strand,string & seq)120 void CAlnWriter::GetSeqString(CBioseq_Handle bsh,
121     const CRange<TSeqPos>& range,
122     ENa_strand strand,
123     string& seq)
124 {
125     if (!bsh) {
126         NCBI_THROW(CObjWriterException,
127             eBadInput,
128             "Empty bioseq handle");
129     }
130 
131     CSeqVector seq_vec = bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac, strand);
132     if (range.IsWhole()) {
133         seq_vec.GetSeqData(0, bsh.GetBioseqLength(), seq);
134     }
135     else {
136         seq_vec.GetSeqData(range.GetFrom(), range.GetTo(), seq);
137     }
138 
139     if (NStr::IsBlank(seq)) {
140         NCBI_THROW(CObjWriterException,
141             eBadInput,
142             "Empty sequence string");
143     }
144 }
145 
146 // -----------------------------------------------------------------------------
147 
WriteAlignDenseSeg(const CDense_seg & denseg)148 bool CAlnWriter::WriteAlignDenseSeg(
149     const CDense_seg& denseg)
150 {
151     if (!denseg.CanGetDim() ||
152         !denseg.CanGetNumseg() ||
153         !denseg.CanGetIds() ||
154         !denseg.CanGetStarts() ||
155         !denseg.CanGetLens())
156     {
157         return false;
158     }
159 
160     const auto num_rows = denseg.GetDim();
161     const auto num_segs = denseg.GetNumseg();
162 
163 
164     for (int row=0; row<num_rows; ++row)
165     {
166         const CSeq_id& id = denseg.GetSeq_id(row);
167         CRange<TSeqPos> range;
168 
169         CBioseq_Handle bsh;
170         ProcessSeqId(id, bsh, range);
171         if (!bsh) {
172             NCBI_THROW(CObjWriterException,
173                 eBadInput,
174                 "Unable to fetch Bioseq");
175         }
176 
177         string seq_plus;
178         GetSeqString(bsh, range, eNa_strand_plus, seq_plus);
179 
180         const CSeqUtil::ECoding coding =
181             (bsh.IsNucleotide()) ?
182             CSeqUtil::e_Iupacna :
183             CSeqUtil::e_Iupacaa;
184 
185         string seqdata;
186         for (int seg=0; seg<num_segs; ++seg)
187         {
188             const auto start = denseg.GetStarts()[seg*num_rows + row];
189             const auto len   = denseg.GetLens()[seg];
190             const ENa_strand strand = (denseg.IsSetStrands()) ?
191                 denseg.GetStrands()[seg*num_rows + row] :
192                 eNa_strand_plus;
193             seqdata += GetSegString(seq_plus, coding, strand, start, len);
194         }
195 
196         string best_id = GetBestId(id);
197         string defline = ">" + best_id;
198         WriteContiguous(defline, seqdata);
199     }
200 
201     return true;
202 }
203 
204 // -----------------------------------------------------------------------------
205 
WriteAlignSplicedSeg(const CSpliced_seg & spliced_seg)206 bool CAlnWriter::WriteAlignSplicedSeg(
207     const CSpliced_seg& spliced_seg)
208 {
209     if (!spliced_seg.IsSetExons()) {
210         return false;
211     }
212 
213     CRef<CSeq_id> genomic_id;
214     if (spliced_seg.IsSetGenomic_id()) {
215         genomic_id = Ref(new CSeq_id());
216         genomic_id->Assign(spliced_seg.GetGenomic_id());
217     }
218 
219 
220     CRef<CSeq_id> product_id;
221     if (spliced_seg.IsSetGenomic_id()) {
222         product_id = Ref(new CSeq_id());
223         product_id->Assign(spliced_seg.GetProduct_id());
224     }
225 
226 
227     ENa_strand genomic_strand =
228         spliced_seg.IsSetGenomic_strand() ?
229         spliced_seg.GetGenomic_strand() :
230         eNa_strand_plus;
231 
232 
233     ENa_strand product_strand =
234         spliced_seg.IsSetProduct_strand() ?
235         spliced_seg.GetProduct_strand() :
236         eNa_strand_plus;
237 
238     return WriteSplicedExons(spliced_seg.GetExons(),
239                               spliced_seg.GetProduct_type(),
240                               genomic_id,
241                               genomic_strand,
242                               product_id,
243                               product_strand);
244 }
245 
s_ProductLength(const CProduct_pos & start,const CProduct_pos & end)246 unsigned int s_ProductLength(const CProduct_pos& start, const CProduct_pos& end)
247 {
248     if (start.Which() != end.Which()) {
249         NCBI_THROW(CObjWriterException,
250             eBadInput,
251             "Unable to determine product length");
252     }
253 
254     if (start.Which() == CProduct_pos::e_not_set) {
255         NCBI_THROW(CObjWriterException,
256             eBadInput,
257             "Unable to determine product length");
258     }
259 
260     const int length = end.AsSeqPos() - start.AsSeqPos();
261 
262     return (length >= 0) ? length : -length;
263 }
264 
265 
266 // -----------------------------------------------------------------------------
WriteSplicedExons(const list<CRef<CSpliced_exon>> & exons,CSpliced_seg::TProduct_type product_type,CRef<CSeq_id> default_genomic_id,ENa_strand default_genomic_strand,CRef<CSeq_id> default_product_id,ENa_strand default_product_strand)267 bool CAlnWriter::WriteSplicedExons(const list<CRef<CSpliced_exon>>& exons,
268     CSpliced_seg::TProduct_type product_type,
269     CRef<CSeq_id> default_genomic_id, // May be NULL
270     ENa_strand default_genomic_strand,
271     CRef<CSeq_id> default_product_id,
272     ENa_strand default_product_strand)
273 {
274     string prev_genomic_id;
275     string prev_product_id;
276     for (const CRef<CSpliced_exon>& exon : exons) {
277 
278         const CSeq_id& genomic_id =
279             exon->IsSetGenomic_id() ?
280             exon->GetGenomic_id() :
281             *default_genomic_id;
282 
283         const CSeq_id& product_id =
284             exon->IsSetProduct_id() ?
285             exon->GetProduct_id() :
286             *default_product_id;
287 
288         // Should check to see that the ids are not empty
289         const ENa_strand genomic_strand =
290             exon->IsSetGenomic_strand() ?
291             exon->GetGenomic_strand() :
292             default_genomic_strand;
293 
294         const ENa_strand product_strand =
295             exon->IsSetProduct_strand() ?
296             exon->GetProduct_strand() :
297             default_product_strand;
298 
299         const auto genomic_start = exon->GetGenomic_start();
300         const auto genomic_end = exon->GetGenomic_end();
301 
302         if (genomic_end < genomic_start) {
303             NCBI_THROW(CObjWriterException,
304                 eBadInput,
305                 "Bad genomic location: end < start");
306         }
307         const int genomic_length = genomic_end - genomic_start;
308 
309         const int product_start = exon->GetProduct_start().AsSeqPos();
310         const int product_end = exon->GetProduct_end().AsSeqPos();
311 
312 
313         if (product_end < product_start) {
314             NCBI_THROW(CObjWriterException,
315                 eBadInput,
316                 "Bad product location: end < start");
317         }
318         // product_length is now given in nucleotide units
319         const int product_length = product_end - product_start;
320 
321         CBioseq_Handle genomic_bsh;
322         if (m_pScope) {
323             genomic_bsh = m_pScope->GetBioseqHandle(genomic_id);
324         }
325         if (!genomic_bsh) {
326             NCBI_THROW(CObjWriterException,
327                 eBadInput,
328                 "Unable to resolve genomic sequence");
329         }
330         CRange<TSeqPos> genomic_range(genomic_start, genomic_end+1);
331         string genomic_seq;
332         GetSeqString(genomic_bsh, genomic_range, genomic_strand, genomic_seq);
333 
334         CBioseq_Handle product_bsh;
335         if (m_pScope) {
336             product_bsh = m_pScope->GetBioseqHandle(product_id);
337         }
338         if (!product_bsh) {
339             NCBI_THROW(CObjWriterException,
340                 eBadInput,
341                 "Unable to resolve product sequence");
342         }
343         CRange<TSeqPos> product_range(product_start, product_end+1);
344         string product_seq;
345         GetSeqString(product_bsh, product_range, product_strand, product_seq);
346 
347         if (exon->IsSetParts()) {
348             AddGaps(product_type, exon->GetParts(), genomic_seq, product_seq);
349         }
350         else
351         if (product_length != genomic_length) {
352             NCBI_THROW(CObjWriterException,
353                 eBadInput,
354                 "Lengths of genomic and product sequences don't match");
355         }
356 
357         WriteContiguous(">" + GetBestId(genomic_id), genomic_seq);
358         WriteContiguous(">" + GetBestId(product_id), product_seq);
359     }
360 
361     return true;
362 }
363 
364 // -----------------------------------------------------------------------------
365 
AddGaps(CSpliced_seg::TProduct_type product_type,const CSpliced_exon::TParts & exon_chunks,string & genomic_seq,string & product_seq)366 void CAlnWriter::AddGaps(
367         CSpliced_seg::TProduct_type product_type,
368         const CSpliced_exon::TParts& exon_chunks,
369         string& genomic_seq,
370         string& product_seq)
371 {
372 
373     if (exon_chunks.empty()) {
374         return;
375     }
376 
377     string genomic_string;
378     string product_string;
379 
380     const unsigned int res_width =
381         (product_type == CSpliced_seg::eProduct_type_transcript) ?
382         1 : 3;
383 
384 
385     // Check that match + mismatch + diag + genomic_ins = genomic_length
386     int genomic_pos = 0;
387     int product_pos = 0;
388     unsigned int interval_width = 0;
389     for (CRef<CSpliced_exon_chunk> exon_chunk : exon_chunks) {
390         auto chunk_type = exon_chunk->Which();
391         switch(chunk_type) {
392         case CSpliced_exon_chunk::e_Match:
393         case CSpliced_exon_chunk::e_Mismatch:
394         case CSpliced_exon_chunk::e_Diag:
395             switch(chunk_type) {
396                 default:
397                     // compiler, shut up!
398                     break;
399                 case CSpliced_exon_chunk::e_Match:
400                     interval_width = exon_chunk->GetMatch();
401                     break;
402                 case CSpliced_exon_chunk::e_Mismatch:
403                     interval_width = exon_chunk->GetMismatch();
404                     break;
405                 case CSpliced_exon_chunk::e_Diag:
406                     interval_width = exon_chunk->GetDiag();
407                     break;
408             }
409             genomic_string.append(genomic_seq, genomic_pos, interval_width);
410             product_string.append(product_seq, product_pos, (interval_width + (res_width-1))/res_width);
411             genomic_pos += interval_width;
412             product_pos += interval_width/res_width;
413             break;
414 
415         case CSpliced_exon_chunk::e_Genomic_ins:
416             interval_width = exon_chunk->GetGenomic_ins();
417             genomic_string.append(genomic_seq, genomic_pos, interval_width);
418             product_string.append(interval_width/res_width, '-');
419             genomic_pos += interval_width;
420             break;
421 
422         case CSpliced_exon_chunk::e_Product_ins:
423             interval_width = exon_chunk->GetProduct_ins();
424             genomic_string.append(interval_width, '-');
425             product_string.append(product_seq, product_pos, interval_width/res_width);
426             product_pos += interval_width/res_width;
427             break;
428         default:
429             break;
430         }
431     }
432     genomic_seq = genomic_string;
433     product_seq = product_string;
434 }
435 
436 
437 // -----------------------------------------------------------------------------
438 
GetSegString(const string & seq_plus,CSeqUtil::ECoding coding,const ENa_strand strand,const int start,const size_t len)439 string CAlnWriter::GetSegString(const string& seq_plus,
440     CSeqUtil::ECoding coding,
441     const ENa_strand strand,
442     const int start,
443     const size_t len)
444 {
445     if (start >= 0) {
446         if (start >= seq_plus.size()) {
447             NCBI_THROW(CObjWriterException,
448                 eBadInput,
449                 "Bad location: impossible start");
450         }
451         if (strand != eNa_strand_minus) {
452             return seq_plus.substr(start, len);
453         }
454         // else
455         string seq_minus;
456         CSeqManip::ReverseComplement(seq_plus, coding, start, len, seq_minus);
457         return seq_minus;
458     }
459 
460     return string(len, '-');
461 }
462 
463 
464 // -----------------------------------------------------------------------------
WriteAlignSparseSeg(const CSparse_seg & sparse_seg)465 bool CAlnWriter::WriteAlignSparseSeg(
466     const CSparse_seg& sparse_seg)
467 {
468     for (CRef<CSparse_align> align : sparse_seg.GetRows())
469     {
470         if (!WriteSparseAlign(*align)) {
471             return false;
472         }
473     }
474     return true;
475 }
476 
477 
478 
479 // -----------------------------------------------------------------------------
480 
WriteSparseAlign(const CSparse_align & sparse_align)481 bool CAlnWriter::WriteSparseAlign(const CSparse_align& sparse_align)
482 {
483     const auto num_segs = sparse_align.GetNumseg();
484 
485     {
486         const CSeq_id& first_id  = sparse_align.GetFirst_id();
487         CBioseq_Handle bsh;
488         CRange<TSeqPos> range;
489         ProcessSeqId(first_id, bsh, range);
490         if (!bsh) {
491             NCBI_THROW(CObjWriterException,
492                 eBadInput,
493                 "Unable to resolve ID " + first_id.AsFastaString());
494         }
495         const CSeqUtil::ECoding coding =
496             (bsh.IsNucleotide())?
497             CSeqUtil::e_Iupacna :
498             CSeqUtil::e_Iupacaa;
499 
500         string seq_plus;
501         GetSeqString(bsh, range, eNa_strand_plus, seq_plus);
502 
503         string seqdata;
504         for (int seg=0; seg<num_segs; ++seg) {
505             const auto start = sparse_align.GetFirst_starts()[seg];
506             const auto len = sparse_align.GetLens()[seg];
507             seqdata += GetSegString(seq_plus, coding, eNa_strand_plus, start, len);
508         }
509         WriteContiguous(">" + GetBestId(first_id), seqdata);
510     }
511 
512     { // Second row
513         const CSeq_id& second_id  = sparse_align.GetSecond_id();
514         CBioseq_Handle bsh;
515         CRange<TSeqPos> range;
516         ProcessSeqId(second_id, bsh, range);
517         if (!bsh) {
518             NCBI_THROW(CObjWriterException,
519                 eBadInput,
520                 "Unable to resolve ID " + second_id.AsFastaString());
521         }
522         const CSeqUtil::ECoding coding =
523             (bsh.IsNucleotide())?
524             CSeqUtil::e_Iupacna :
525             CSeqUtil::e_Iupacaa;
526 
527         string seq_plus;
528         GetSeqString(bsh, range, eNa_strand_plus, seq_plus);
529 
530         string seqdata;
531         const vector<ENa_strand>& strands = sparse_align.IsSetSecond_strands() ?
532             sparse_align.GetSecond_strands() : vector<ENa_strand>(num_segs, eNa_strand_plus);
533         for (int seg=0; seg<num_segs; ++seg) {
534             const auto start = sparse_align.GetFirst_starts()[seg];
535             const auto len = sparse_align.GetLens()[seg];
536             seqdata += GetSegString(seq_plus, coding, strands[seg], start, len);
537         }
538 
539         WriteContiguous(">" + GetBestId(second_id), seqdata);
540     }
541     return true;
542 }
543 
544 // -----------------------------------------------------------------------------
545 
WriteContiguous(const string & defline,const string & seqdata)546 void CAlnWriter::WriteContiguous(const string& defline, const string& seqdata)
547 {
548     if (defline.back() == '|' && defline.size() > 1)
549     {
550         const auto length = defline.size();
551         m_Os << defline.substr(0,length-1) << "\n";
552     }
553     else {
554         m_Os << defline << "\n";
555     }
556     size_t pos=0;
557     while (pos < seqdata.size()) {
558 
559         if (IsCanceled()) {
560             NCBI_THROW(
561                 CObjWriterException,
562                 eInterrupted,
563                 "Processing terminated by user");
564         }
565 
566         m_Os << seqdata.substr(pos, m_Width) << "\n";
567         pos += m_Width;
568     }
569 }
570 
571 // -----------------------------------------------------------------------------
572 
GetBestId(const CSeq_id & id)573 string CAlnWriter::GetBestId(const CSeq_id& id)
574 {
575     string best_id;
576     CGenbankIdResolve::Get().GetBestId(
577         CSeq_id_Handle::GetHandle(id),
578         *m_pScope,
579         best_id);
580     return best_id;
581 }
582 
583 // -----------------------------------------------------------------------------
584 
585 END_NCBI_SCOPE
586 
587