1 /*  $Id: bed_writer.cpp 632624 2021-06-03 17:38:23Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Frank Ludwig
27  *
28  * File Description:  Write bed file
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 
34 #include <objects/seq/Seq_annot.hpp>
35 #include <objects/seqloc/Seq_interval.hpp>
36 #include <objects/seq/Annot_descr.hpp>
37 #include <objects/seq/Annotdesc.hpp>
38 #include <objects/general/Object_id.hpp>
39 #include <objects/general/User_object.hpp>
40 #include <objects/general/User_field.hpp>
41 #include <objects/seqfeat/Feat_id.hpp>
42 #include <objects/seqfeat/SeqFeatXref.hpp>
43 
44 #include <objmgr/scope.hpp>
45 #include <objmgr/annot_ci.hpp>
46 #include <objmgr/feat_ci.hpp>
47 #include <objmgr/mapped_feat.hpp>
48 
49 #include <objtools/writers/writer_exception.hpp>
50 #include <objtools/writers/write_util.hpp>
51 #include <objtools/writers/bed_track_record.hpp>
52 #include <objtools/writers/bed_feature_record.hpp>
53 #include <objtools/writers/bed_writer.hpp>
54 
55 BEGIN_NCBI_SCOPE
56 USING_SCOPE(objects);
57 
58 //  ----------------------------------------------------------------------------
59 class CThreeFeatRecord
60 //  ----------------------------------------------------------------------------
61 {
62     friend class CThreeFeatManager;
63 
64 public:
CThreeFeatRecord()65     CThreeFeatRecord() {};
66     CThreeFeatRecord(
67         const CSeq_feat&);
~CThreeFeatRecord()68     ~CThreeFeatRecord() {};
69 
70     bool AddFeature(
71         const CSeq_feat&);
72 
73     bool IsRecordComplete() const;
74 
75     bool
76     GetBedFeature(
77         CBedFeatureRecord&) const;
78 
79 private:
80     bool xAddFound(
81         int);
82     bool xAddAll(
83         int);
84 
85     CRef<CSeq_feat> mpChrom;
86     CRef<CSeq_feat> mpThick;
87     CRef<CSeq_feat> mpBlocks;
88     vector<int> mFeatsAll;
89     vector<int> mFeatsFound;
90 };
91 
92 //  ----------------------------------------------------------------------------
CThreeFeatRecord(const CSeq_feat & feat)93 CThreeFeatRecord::CThreeFeatRecord(
94     const CSeq_feat& feat)
95 //  ----------------------------------------------------------------------------
96 {
97     AddFeature(feat);
98 }
99 
100 //  ----------------------------------------------------------------------------
AddFeature(const CSeq_feat & feature)101 bool CThreeFeatRecord::AddFeature(
102     const CSeq_feat& feature)
103 //  ----------------------------------------------------------------------------
104 {
105     string threeFeatType;
106     if (!feature.IsSetId()  ||  !feature.GetId().IsLocal()
107             ||  !feature.GetId().GetLocal().IsId()) {
108         return false;
109     }
110     if (!CWriteUtil::GetThreeFeatType(feature, threeFeatType)) {
111         return false;
112     }
113     bool assigned = false;
114     if (threeFeatType == "chrom") {
115         mpChrom.Reset(new CSeq_feat);
116         mpChrom->Assign(feature);
117         assigned = true;
118     }
119     if (threeFeatType == "thick") {
120         mpThick.Reset(new CSeq_feat);
121         mpThick->Assign(feature);
122         assigned = true;
123     }
124     if (threeFeatType == "block") {
125         mpBlocks.Reset(new CSeq_feat);
126         mpBlocks->Assign(feature);
127         assigned = true;
128     }
129     if (!assigned) {
130         return false;
131     }
132     int featId = feature.GetId().GetLocal().GetId();
133     xAddFound(featId);
134     if (!feature.IsSetXref()) {
135         return true;
136     }
137     for (CRef<CSeqFeatXref> pXref:  feature.GetXref()) {
138         if (!pXref->IsSetId()  ||  !pXref->GetId().IsLocal()  ||
139                 !pXref->GetId().GetLocal().IsId()) {
140             continue;
141         }
142         int featId = pXref->GetId().GetLocal().GetId();
143         xAddAll(featId);
144     }
145     return true;
146 };
147 
148 //  ----------------------------------------------------------------------------
IsRecordComplete() const149 bool CThreeFeatRecord::IsRecordComplete() const
150 //  ----------------------------------------------------------------------------
151 {
152     return (mFeatsFound.size()  ==  mFeatsAll.size());
153 }
154 
155 //  ----------------------------------------------------------------------------
156 bool
GetBedFeature(CBedFeatureRecord & bedRecord) const157 CThreeFeatRecord::GetBedFeature(
158     CBedFeatureRecord& bedRecord) const
159 //  ----------------------------------------------------------------------------
160 {
161     bedRecord = CBedFeatureRecord();
162     if (!mpChrom) {
163         return false;
164     }
165     if (!bedRecord.SetLocation(mpChrom->GetLocation())) {
166         return false;
167     }
168     if (!bedRecord.SetName(mpChrom->GetData())) {
169         return false;
170     }
171     int score;
172     if (!CWriteUtil::GetThreeFeatScore(*mpChrom, score)) {
173         score = 0;
174     }
175     if (!bedRecord.SetScore(score)) {
176         return false;
177     }
178     if (mpThick) {
179         if (!bedRecord.SetThick(mpThick->GetLocation())) {
180             return false;
181         }
182     }
183     else {
184         if (!bedRecord.SetNoThick(mpChrom->GetLocation())) {
185             return false;
186         }
187     }
188     string color;
189     if (CWriteUtil::GetThreeFeatRgb(*mpChrom, color)) {
190         if (!bedRecord.SetRgb(color)) {
191             return false;
192         }
193     }
194     if (mpBlocks) {
195         if (!bedRecord.SetBlocks(
196                 mpChrom->GetLocation(), mpBlocks->GetLocation())) {
197             return false;
198         }
199     }
200     return true;
201 }
202 
203 //  ----------------------------------------------------------------------------
204 bool
xAddFound(int featId)205 CThreeFeatRecord::xAddFound(
206     int featId)
207 //
208 //  Expectation: featId is not listed as found already
209 //  ----------------------------------------------------------------------------
210 {
211     vector<int>::iterator it = std::find(
212         mFeatsFound.begin(), mFeatsFound.end(), featId);
213     if (it != mFeatsFound.end()) {
214         return false;
215     }
216     mFeatsFound.push_back(featId);
217     return xAddAll(featId);
218 }
219 
220 //  ----------------------------------------------------------------------------
221 bool
xAddAll(int featId)222 CThreeFeatRecord::xAddAll(
223     int featId)
224 //  ----------------------------------------------------------------------------
225 {
226     vector<int>::iterator it = std::find(
227         mFeatsAll.begin(), mFeatsAll.end(), featId);
228     if (it == mFeatsAll.end()) {
229         mFeatsAll.push_back(featId);
230     }
231     return true;
232 }
233 
234 //  ----------------------------------------------------------------------------
235 class CThreeFeatManager
236 //  ----------------------------------------------------------------------------
237 {
238 public:
239     using RECORDS = vector<CThreeFeatRecord>;
240     using RECORD_IT = RECORDS::iterator;
241 
CThreeFeatManager()242     CThreeFeatManager() {};
~CThreeFeatManager()243     ~CThreeFeatManager() {};
244 
245     bool
246     AddFeature(
247         const CSeq_feat&);
248 
249     bool
250     IsRecordComplete(
251         const CSeq_feat&);
252 
253     bool
254     ProcessRecord(
255         const CSeq_feat&,
256         CBedFeatureRecord&);
257 
258     bool
259     GetAnyRecord(
260         CBedFeatureRecord&);
261 
262 private:
263     RECORD_IT
264     xFindExistingRecord(
265         const CSeq_feat&);
266 
267     RECORD_IT
268     xAddRecord(
269         const CSeq_feat&);
270 
271     RECORDS mRecords;
272 };
273 
274 //  ----------------------------------------------------------------------------
AddFeature(const CSeq_feat & feature)275 bool CThreeFeatManager::AddFeature(
276     const CSeq_feat& feature)
277 //  ----------------------------------------------------------------------------
278 {
279     RECORD_IT it = xFindExistingRecord(feature);
280     if (it == mRecords.end()) {
281         RECORD_IT addIt = xAddRecord(feature);
282         return (addIt != mRecords.end());
283     }
284     else {
285         return it->AddFeature(feature);
286     }
287 }
288 
289 //  ----------------------------------------------------------------------------
IsRecordComplete(const CSeq_feat & feature)290 bool CThreeFeatManager::IsRecordComplete(
291     const CSeq_feat& feature)
292 //  ----------------------------------------------------------------------------
293 {
294     RECORD_IT it = xFindExistingRecord(feature);
295     if (it == mRecords.end()) {
296         return false;
297     }
298     return (it->IsRecordComplete());
299 }
300 
301 //  ----------------------------------------------------------------------------
302 bool
ProcessRecord(const CSeq_feat & feature,CBedFeatureRecord & bedRecord)303 CThreeFeatManager::ProcessRecord(
304     const CSeq_feat& feature,
305     CBedFeatureRecord& bedRecord)
306 //  ----------------------------------------------------------------------------
307 {
308     RECORD_IT it = xFindExistingRecord(feature);
309     if (it == mRecords.end()) {
310         return false;
311     }
312     if (!it->GetBedFeature(bedRecord)) {
313         return false;
314     }
315     mRecords.erase(it);
316     return true;
317 }
318 
319 //  ----------------------------------------------------------------------------
320 bool
GetAnyRecord(CBedFeatureRecord & bedRecord)321 CThreeFeatManager::GetAnyRecord(
322     CBedFeatureRecord& bedRecord)
323 //  ----------------------------------------------------------------------------
324 {
325     if (mRecords.empty()) {
326         return false;
327     }
328     RECORD_IT it = mRecords.end() - 1;
329     if (!it->GetBedFeature(bedRecord)) {
330         return false;
331     }
332     mRecords.erase(it);
333     return true;
334 }
335 
336 //  ----------------------------------------------------------------------------
337 CThreeFeatManager::RECORD_IT
xFindExistingRecord(const CSeq_feat & feature)338 CThreeFeatManager::xFindExistingRecord(
339     const CSeq_feat& feature)
340 //  ----------------------------------------------------------------------------
341 {
342     if (!feature.IsSetId()  ||  !feature.GetId().IsLocal()
343             ||  !feature.GetId().GetLocal().IsId()) {
344         return mRecords.end();
345     }
346     int featId = feature.GetId().GetLocal().GetId();
347     for (RECORD_IT it = mRecords.begin(); it != mRecords.end(); ++it) {
348         vector<int>::iterator iit = std::find(
349             it->mFeatsAll.begin(), it->mFeatsAll.end(), featId);
350         if (iit != it->mFeatsAll.end()) {
351             return it;
352         }
353     }
354     return mRecords.end();
355 }
356 
357 //  ----------------------------------------------------------------------------
358 CThreeFeatManager::RECORD_IT
xAddRecord(const CSeq_feat & feature)359 CThreeFeatManager::xAddRecord(
360     const CSeq_feat& feature)
361 //  ----------------------------------------------------------------------------
362 {
363     CThreeFeatRecord threeFeatRecord;
364     if (!threeFeatRecord.AddFeature(feature)) {
365         return mRecords.end();
366     }
367     mRecords.push_back(threeFeatRecord);
368     return (mRecords.end() - 1);
369 }
370 
371 //  ----------------------------------------------------------------------------
CBedWriter(CScope & scope,CNcbiOstream & ostr,unsigned int colCount,unsigned int uFlags)372 CBedWriter::CBedWriter(
373     CScope& scope,
374     CNcbiOstream& ostr,
375     unsigned int colCount,
376     unsigned int uFlags ) :
377 //  ----------------------------------------------------------------------------
378     CWriterBase(ostr, uFlags),
379     m_Scope(scope),
380     m_colCount(colCount)
381 {
382     // the first three columns are mandatory
383     if (m_colCount < 3) {
384         m_colCount = 3;
385     }
386 };
387 
388 //  ----------------------------------------------------------------------------
~CBedWriter()389 CBedWriter::~CBedWriter()
390 //  ----------------------------------------------------------------------------
391 {
392 };
393 
394 //  ----------------------------------------------------------------------------
395 
396 //  ----------------------------------------------------------------------------
WriteAnnot(const CSeq_annot & annot,const string &,const string &)397 bool CBedWriter::WriteAnnot(
398     const CSeq_annot& annot,
399     const string&,
400     const string& )
401 //  ----------------------------------------------------------------------------
402 {
403     m_colCount = 6;
404     if( annot.CanGetDesc() ) {
405         ITERATE(CAnnot_descr::Tdata, DescIter, annot.GetDesc().Get()) {
406             const CAnnotdesc& desc = **DescIter;
407             if(desc.IsUser()) {
408                 if(desc.GetUser().HasField("NCBI_BED_COLUMN_COUNT")) {
409                     CConstRef< CUser_field > field = desc.GetUser().GetFieldRef("NCBI_BED_COLUMN_COUNT");
410                     if(field && field->CanGetData() && field->GetData().IsInt()) {
411                         m_colCount = field->GetData().GetInt();
412                     }
413                 }
414             }
415         }
416     }
417 
418     CBedTrackRecord track;
419     if ( ! track.Assign(annot) ) {
420         return false;
421     }
422     track.Write(m_Os);
423 
424     if (CWriteUtil::IsThreeFeatFormat(annot)) {
425         return xWriteAnnotThreeFeatData(track, annot);
426     }
427     else {
428         return xWriteAnnotFeatureTable(track, annot);
429     }
430 }
431 
432 //  ----------------------------------------------------------------------------
WriteSeqEntryHandle(CSeq_entry_Handle seh,const string & strAssemblyName,const string & strAssemblyAccession)433 bool CBedWriter::WriteSeqEntryHandle(
434     CSeq_entry_Handle seh,
435     const string& strAssemblyName,
436     const string& strAssemblyAccession )
437 //  ----------------------------------------------------------------------------
438 {
439     CBedTrackRecord track;
440 
441     SAnnotSelector sel;
442     for (CAnnot_CI aci(seh, sel); aci; ++aci) {
443         auto sah = *aci;
444         if (track.Assign(*sah.GetCompleteSeq_annot()) ) {
445             track.Write(m_Os);
446         }
447 
448         if (!xWriteAnnotFeatureTable(track, sah)) {
449             return false;
450         }
451     }
452     return true;
453 }
454 
455 
456 //  ----------------------------------------------------------------------------
xWriteAnnotFeatureTable(const CBedTrackRecord & track,const CSeq_annot & annot)457 bool CBedWriter::xWriteAnnotFeatureTable(
458     const CBedTrackRecord& track,
459     const CSeq_annot& annot)
460 //  ----------------------------------------------------------------------------
461 {
462     SAnnotSelector sel = SetAnnotSelector();
463     CSeq_annot_Handle sah = m_Scope.AddSeq_annot(annot);
464     auto result = xWriteAnnotFeatureTable(track, sah);
465     m_Scope.RemoveSeq_annot(sah);
466     return result;
467 }
468 
469 
470 //  ----------------------------------------------------------------------------
xWriteAnnotFeatureTable(const CBedTrackRecord & track,const CSeq_annot_Handle & sah)471 bool CBedWriter::xWriteAnnotFeatureTable(
472     const CBedTrackRecord& track,
473     const CSeq_annot_Handle& sah)
474 //  ----------------------------------------------------------------------------
475 {
476     for (CFeat_CI pMf(sah, SAnnotSelector()); pMf; ++pMf ) {
477         if (IsCanceled()) {
478             NCBI_THROW(
479                 CObjWriterException,
480                 eInterrupted,
481                 "Processing terminated by user");
482         }
483         if (!xWriteFeature(track, *pMf)) {
484             return false;
485         }
486     }
487     return true;
488 }
489 
490 
491 //  ----------------------------------------------------------------------------
xWriteAnnotThreeFeatData(const CBedTrackRecord & track,const CSeq_annot & annot)492 bool CBedWriter::xWriteAnnotThreeFeatData(
493     const CBedTrackRecord& track,
494     const CSeq_annot& annot)
495 //  ----------------------------------------------------------------------------
496 {
497     CThreeFeatManager threeFeatManager;
498     CBedFeatureRecord bedRecord;
499 
500     CSeq_annot_Handle sah = m_Scope.AddSeq_annot(annot);
501     auto result = xWriteAnnotThreeFeatData(track, sah);
502     m_Scope.RemoveSeq_annot(sah);
503     return result;
504 }
505 
506 //  ----------------------------------------------------------------------------
xWriteAnnotThreeFeatData(const CBedTrackRecord & track,const CSeq_annot_Handle & sah)507 bool CBedWriter::xWriteAnnotThreeFeatData(
508     const CBedTrackRecord& track,
509     const CSeq_annot_Handle& sah)
510 //  ----------------------------------------------------------------------------
511 {
512     CThreeFeatManager threeFeatManager;
513     CBedFeatureRecord bedRecord;
514 
515     SAnnotSelector sel = SetAnnotSelector();
516     CFeat_CI pMf(sah, sel);
517     for ( ; pMf; ++pMf ) {
518         if (IsCanceled()) {
519             NCBI_THROW(
520                 CObjWriterException,
521                 eInterrupted,
522                 "Processing terminated by user");
523         }
524         const CSeq_feat& feature = pMf->GetOriginalFeature();
525         if (!threeFeatManager.AddFeature(feature)) {
526             break;
527         }
528         if (!threeFeatManager.IsRecordComplete(feature)) {
529             continue;
530         }
531         if (!threeFeatManager.ProcessRecord(feature, bedRecord)) {
532             break;
533         }
534         if (!bedRecord.Write(m_Os, m_colCount)) {
535             break;
536         }
537     }
538     if (!pMf) {
539         while (threeFeatManager.GetAnyRecord(bedRecord)) {
540             continue;
541         }
542     }
543     return (!pMf);
544 }
545 
546 
547 
xWriteFeature(const CMappedFeat & mapped_feat,CSeq_annot_Handle annot_handle,CBioseq_Handle dummy_arg)548 bool CBedWriter::xWriteFeature(
549     const CMappedFeat& mapped_feat,
550     CSeq_annot_Handle annot_handle,
551     CBioseq_Handle dummy_arg)
552 {
553     // Inefficient!
554     // Store track and annot_handle, and only recreate track
555     // if the annot_handle has changed since the last call.
556     CBedTrackRecord track;
557     if ( ! track.Assign(*(annot_handle.GetCompleteSeq_annot())) ) {
558         return false;
559     }
560 
561     return xWriteFeature(track, mapped_feat);
562 }
563 
564 
565 //  ----------------------------------------------------------------------------
xWriteFeature(CFeat_CI feat_it)566 bool CBedWriter::xWriteFeature(
567     CFeat_CI feat_it)
568 //  ----------------------------------------------------------------------------
569 {
570     if (!feat_it) {
571         return false;
572     }
573 
574     const auto& annot_handle = feat_it.GetAnnot();
575 
576     // Inefficient!
577     // Store track and annot_handle, and only recreate track
578     // if the annot_handle has changed since the last call.
579     CBedTrackRecord track;
580     if ( ! track.Assign(*(annot_handle.GetCompleteSeq_annot())) ) {
581         return false;
582     }
583 
584     return xWriteFeature(track, *feat_it);
585 }
586 
587 
588 //  ----------------------------------------------------------------------------
xWriteFeature(const CBedTrackRecord & track,const CMappedFeat & mf)589 bool CBedWriter::xWriteFeature(
590     const CBedTrackRecord& track,
591     const CMappedFeat& mf)
592 //  ----------------------------------------------------------------------------
593 {
594     CBedFeatureRecord record;
595     if (!record.AssignName(mf)) {
596         return false;
597     }
598     if (!record.AssignDisplayData( mf, track.UseScore())) {
599 //          feature did not contain display data ---
600 //          Is there any alternative way to populate some of the bed columns?
601 //          For now, keep going, emit at least the locations ...
602     }
603 
604     CRef<CSeq_loc> pPackedInt(new CSeq_loc(CSeq_loc::e_Mix));
605     pPackedInt->Add(mf.GetLocation());
606     CWriteUtil::ChangeToPackedInt(*pPackedInt);
607 
608     if (!pPackedInt->IsPacked_int() || !pPackedInt->GetPacked_int().CanGet()) {
609         // nothing to do
610         return true;
611     }
612 
613     const list<CRef<CSeq_interval> >& sublocs = pPackedInt->GetPacked_int().Get();
614     list<CRef<CSeq_interval> >::const_iterator it;
615     for (it = sublocs.begin(); it != sublocs.end(); ++it ) {
616         if (!record.AssignLocation(m_Scope, **it)  ||  !record.Write(m_Os, m_colCount)) {
617             return false;
618         }
619     }
620     return true;
621 }
622 
623 
624 END_NCBI_SCOPE
625