1 /* $Id: bed_writer.cpp 632624 2021-06-03 17:38:23Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Frank Ludwig
27 *
28 * File Description: Write bed file
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33
34 #include <objects/seq/Seq_annot.hpp>
35 #include <objects/seqloc/Seq_interval.hpp>
36 #include <objects/seq/Annot_descr.hpp>
37 #include <objects/seq/Annotdesc.hpp>
38 #include <objects/general/Object_id.hpp>
39 #include <objects/general/User_object.hpp>
40 #include <objects/general/User_field.hpp>
41 #include <objects/seqfeat/Feat_id.hpp>
42 #include <objects/seqfeat/SeqFeatXref.hpp>
43
44 #include <objmgr/scope.hpp>
45 #include <objmgr/annot_ci.hpp>
46 #include <objmgr/feat_ci.hpp>
47 #include <objmgr/mapped_feat.hpp>
48
49 #include <objtools/writers/writer_exception.hpp>
50 #include <objtools/writers/write_util.hpp>
51 #include <objtools/writers/bed_track_record.hpp>
52 #include <objtools/writers/bed_feature_record.hpp>
53 #include <objtools/writers/bed_writer.hpp>
54
55 BEGIN_NCBI_SCOPE
56 USING_SCOPE(objects);
57
58 // ----------------------------------------------------------------------------
59 class CThreeFeatRecord
60 // ----------------------------------------------------------------------------
61 {
62 friend class CThreeFeatManager;
63
64 public:
CThreeFeatRecord()65 CThreeFeatRecord() {};
66 CThreeFeatRecord(
67 const CSeq_feat&);
~CThreeFeatRecord()68 ~CThreeFeatRecord() {};
69
70 bool AddFeature(
71 const CSeq_feat&);
72
73 bool IsRecordComplete() const;
74
75 bool
76 GetBedFeature(
77 CBedFeatureRecord&) const;
78
79 private:
80 bool xAddFound(
81 int);
82 bool xAddAll(
83 int);
84
85 CRef<CSeq_feat> mpChrom;
86 CRef<CSeq_feat> mpThick;
87 CRef<CSeq_feat> mpBlocks;
88 vector<int> mFeatsAll;
89 vector<int> mFeatsFound;
90 };
91
92 // ----------------------------------------------------------------------------
CThreeFeatRecord(const CSeq_feat & feat)93 CThreeFeatRecord::CThreeFeatRecord(
94 const CSeq_feat& feat)
95 // ----------------------------------------------------------------------------
96 {
97 AddFeature(feat);
98 }
99
100 // ----------------------------------------------------------------------------
AddFeature(const CSeq_feat & feature)101 bool CThreeFeatRecord::AddFeature(
102 const CSeq_feat& feature)
103 // ----------------------------------------------------------------------------
104 {
105 string threeFeatType;
106 if (!feature.IsSetId() || !feature.GetId().IsLocal()
107 || !feature.GetId().GetLocal().IsId()) {
108 return false;
109 }
110 if (!CWriteUtil::GetThreeFeatType(feature, threeFeatType)) {
111 return false;
112 }
113 bool assigned = false;
114 if (threeFeatType == "chrom") {
115 mpChrom.Reset(new CSeq_feat);
116 mpChrom->Assign(feature);
117 assigned = true;
118 }
119 if (threeFeatType == "thick") {
120 mpThick.Reset(new CSeq_feat);
121 mpThick->Assign(feature);
122 assigned = true;
123 }
124 if (threeFeatType == "block") {
125 mpBlocks.Reset(new CSeq_feat);
126 mpBlocks->Assign(feature);
127 assigned = true;
128 }
129 if (!assigned) {
130 return false;
131 }
132 int featId = feature.GetId().GetLocal().GetId();
133 xAddFound(featId);
134 if (!feature.IsSetXref()) {
135 return true;
136 }
137 for (CRef<CSeqFeatXref> pXref: feature.GetXref()) {
138 if (!pXref->IsSetId() || !pXref->GetId().IsLocal() ||
139 !pXref->GetId().GetLocal().IsId()) {
140 continue;
141 }
142 int featId = pXref->GetId().GetLocal().GetId();
143 xAddAll(featId);
144 }
145 return true;
146 };
147
148 // ----------------------------------------------------------------------------
IsRecordComplete() const149 bool CThreeFeatRecord::IsRecordComplete() const
150 // ----------------------------------------------------------------------------
151 {
152 return (mFeatsFound.size() == mFeatsAll.size());
153 }
154
155 // ----------------------------------------------------------------------------
156 bool
GetBedFeature(CBedFeatureRecord & bedRecord) const157 CThreeFeatRecord::GetBedFeature(
158 CBedFeatureRecord& bedRecord) const
159 // ----------------------------------------------------------------------------
160 {
161 bedRecord = CBedFeatureRecord();
162 if (!mpChrom) {
163 return false;
164 }
165 if (!bedRecord.SetLocation(mpChrom->GetLocation())) {
166 return false;
167 }
168 if (!bedRecord.SetName(mpChrom->GetData())) {
169 return false;
170 }
171 int score;
172 if (!CWriteUtil::GetThreeFeatScore(*mpChrom, score)) {
173 score = 0;
174 }
175 if (!bedRecord.SetScore(score)) {
176 return false;
177 }
178 if (mpThick) {
179 if (!bedRecord.SetThick(mpThick->GetLocation())) {
180 return false;
181 }
182 }
183 else {
184 if (!bedRecord.SetNoThick(mpChrom->GetLocation())) {
185 return false;
186 }
187 }
188 string color;
189 if (CWriteUtil::GetThreeFeatRgb(*mpChrom, color)) {
190 if (!bedRecord.SetRgb(color)) {
191 return false;
192 }
193 }
194 if (mpBlocks) {
195 if (!bedRecord.SetBlocks(
196 mpChrom->GetLocation(), mpBlocks->GetLocation())) {
197 return false;
198 }
199 }
200 return true;
201 }
202
203 // ----------------------------------------------------------------------------
204 bool
xAddFound(int featId)205 CThreeFeatRecord::xAddFound(
206 int featId)
207 //
208 // Expectation: featId is not listed as found already
209 // ----------------------------------------------------------------------------
210 {
211 vector<int>::iterator it = std::find(
212 mFeatsFound.begin(), mFeatsFound.end(), featId);
213 if (it != mFeatsFound.end()) {
214 return false;
215 }
216 mFeatsFound.push_back(featId);
217 return xAddAll(featId);
218 }
219
220 // ----------------------------------------------------------------------------
221 bool
xAddAll(int featId)222 CThreeFeatRecord::xAddAll(
223 int featId)
224 // ----------------------------------------------------------------------------
225 {
226 vector<int>::iterator it = std::find(
227 mFeatsAll.begin(), mFeatsAll.end(), featId);
228 if (it == mFeatsAll.end()) {
229 mFeatsAll.push_back(featId);
230 }
231 return true;
232 }
233
234 // ----------------------------------------------------------------------------
235 class CThreeFeatManager
236 // ----------------------------------------------------------------------------
237 {
238 public:
239 using RECORDS = vector<CThreeFeatRecord>;
240 using RECORD_IT = RECORDS::iterator;
241
CThreeFeatManager()242 CThreeFeatManager() {};
~CThreeFeatManager()243 ~CThreeFeatManager() {};
244
245 bool
246 AddFeature(
247 const CSeq_feat&);
248
249 bool
250 IsRecordComplete(
251 const CSeq_feat&);
252
253 bool
254 ProcessRecord(
255 const CSeq_feat&,
256 CBedFeatureRecord&);
257
258 bool
259 GetAnyRecord(
260 CBedFeatureRecord&);
261
262 private:
263 RECORD_IT
264 xFindExistingRecord(
265 const CSeq_feat&);
266
267 RECORD_IT
268 xAddRecord(
269 const CSeq_feat&);
270
271 RECORDS mRecords;
272 };
273
274 // ----------------------------------------------------------------------------
AddFeature(const CSeq_feat & feature)275 bool CThreeFeatManager::AddFeature(
276 const CSeq_feat& feature)
277 // ----------------------------------------------------------------------------
278 {
279 RECORD_IT it = xFindExistingRecord(feature);
280 if (it == mRecords.end()) {
281 RECORD_IT addIt = xAddRecord(feature);
282 return (addIt != mRecords.end());
283 }
284 else {
285 return it->AddFeature(feature);
286 }
287 }
288
289 // ----------------------------------------------------------------------------
IsRecordComplete(const CSeq_feat & feature)290 bool CThreeFeatManager::IsRecordComplete(
291 const CSeq_feat& feature)
292 // ----------------------------------------------------------------------------
293 {
294 RECORD_IT it = xFindExistingRecord(feature);
295 if (it == mRecords.end()) {
296 return false;
297 }
298 return (it->IsRecordComplete());
299 }
300
301 // ----------------------------------------------------------------------------
302 bool
ProcessRecord(const CSeq_feat & feature,CBedFeatureRecord & bedRecord)303 CThreeFeatManager::ProcessRecord(
304 const CSeq_feat& feature,
305 CBedFeatureRecord& bedRecord)
306 // ----------------------------------------------------------------------------
307 {
308 RECORD_IT it = xFindExistingRecord(feature);
309 if (it == mRecords.end()) {
310 return false;
311 }
312 if (!it->GetBedFeature(bedRecord)) {
313 return false;
314 }
315 mRecords.erase(it);
316 return true;
317 }
318
319 // ----------------------------------------------------------------------------
320 bool
GetAnyRecord(CBedFeatureRecord & bedRecord)321 CThreeFeatManager::GetAnyRecord(
322 CBedFeatureRecord& bedRecord)
323 // ----------------------------------------------------------------------------
324 {
325 if (mRecords.empty()) {
326 return false;
327 }
328 RECORD_IT it = mRecords.end() - 1;
329 if (!it->GetBedFeature(bedRecord)) {
330 return false;
331 }
332 mRecords.erase(it);
333 return true;
334 }
335
336 // ----------------------------------------------------------------------------
337 CThreeFeatManager::RECORD_IT
xFindExistingRecord(const CSeq_feat & feature)338 CThreeFeatManager::xFindExistingRecord(
339 const CSeq_feat& feature)
340 // ----------------------------------------------------------------------------
341 {
342 if (!feature.IsSetId() || !feature.GetId().IsLocal()
343 || !feature.GetId().GetLocal().IsId()) {
344 return mRecords.end();
345 }
346 int featId = feature.GetId().GetLocal().GetId();
347 for (RECORD_IT it = mRecords.begin(); it != mRecords.end(); ++it) {
348 vector<int>::iterator iit = std::find(
349 it->mFeatsAll.begin(), it->mFeatsAll.end(), featId);
350 if (iit != it->mFeatsAll.end()) {
351 return it;
352 }
353 }
354 return mRecords.end();
355 }
356
357 // ----------------------------------------------------------------------------
358 CThreeFeatManager::RECORD_IT
xAddRecord(const CSeq_feat & feature)359 CThreeFeatManager::xAddRecord(
360 const CSeq_feat& feature)
361 // ----------------------------------------------------------------------------
362 {
363 CThreeFeatRecord threeFeatRecord;
364 if (!threeFeatRecord.AddFeature(feature)) {
365 return mRecords.end();
366 }
367 mRecords.push_back(threeFeatRecord);
368 return (mRecords.end() - 1);
369 }
370
371 // ----------------------------------------------------------------------------
CBedWriter(CScope & scope,CNcbiOstream & ostr,unsigned int colCount,unsigned int uFlags)372 CBedWriter::CBedWriter(
373 CScope& scope,
374 CNcbiOstream& ostr,
375 unsigned int colCount,
376 unsigned int uFlags ) :
377 // ----------------------------------------------------------------------------
378 CWriterBase(ostr, uFlags),
379 m_Scope(scope),
380 m_colCount(colCount)
381 {
382 // the first three columns are mandatory
383 if (m_colCount < 3) {
384 m_colCount = 3;
385 }
386 };
387
388 // ----------------------------------------------------------------------------
~CBedWriter()389 CBedWriter::~CBedWriter()
390 // ----------------------------------------------------------------------------
391 {
392 };
393
394 // ----------------------------------------------------------------------------
395
396 // ----------------------------------------------------------------------------
WriteAnnot(const CSeq_annot & annot,const string &,const string &)397 bool CBedWriter::WriteAnnot(
398 const CSeq_annot& annot,
399 const string&,
400 const string& )
401 // ----------------------------------------------------------------------------
402 {
403 m_colCount = 6;
404 if( annot.CanGetDesc() ) {
405 ITERATE(CAnnot_descr::Tdata, DescIter, annot.GetDesc().Get()) {
406 const CAnnotdesc& desc = **DescIter;
407 if(desc.IsUser()) {
408 if(desc.GetUser().HasField("NCBI_BED_COLUMN_COUNT")) {
409 CConstRef< CUser_field > field = desc.GetUser().GetFieldRef("NCBI_BED_COLUMN_COUNT");
410 if(field && field->CanGetData() && field->GetData().IsInt()) {
411 m_colCount = field->GetData().GetInt();
412 }
413 }
414 }
415 }
416 }
417
418 CBedTrackRecord track;
419 if ( ! track.Assign(annot) ) {
420 return false;
421 }
422 track.Write(m_Os);
423
424 if (CWriteUtil::IsThreeFeatFormat(annot)) {
425 return xWriteAnnotThreeFeatData(track, annot);
426 }
427 else {
428 return xWriteAnnotFeatureTable(track, annot);
429 }
430 }
431
432 // ----------------------------------------------------------------------------
WriteSeqEntryHandle(CSeq_entry_Handle seh,const string & strAssemblyName,const string & strAssemblyAccession)433 bool CBedWriter::WriteSeqEntryHandle(
434 CSeq_entry_Handle seh,
435 const string& strAssemblyName,
436 const string& strAssemblyAccession )
437 // ----------------------------------------------------------------------------
438 {
439 CBedTrackRecord track;
440
441 SAnnotSelector sel;
442 for (CAnnot_CI aci(seh, sel); aci; ++aci) {
443 auto sah = *aci;
444 if (track.Assign(*sah.GetCompleteSeq_annot()) ) {
445 track.Write(m_Os);
446 }
447
448 if (!xWriteAnnotFeatureTable(track, sah)) {
449 return false;
450 }
451 }
452 return true;
453 }
454
455
456 // ----------------------------------------------------------------------------
xWriteAnnotFeatureTable(const CBedTrackRecord & track,const CSeq_annot & annot)457 bool CBedWriter::xWriteAnnotFeatureTable(
458 const CBedTrackRecord& track,
459 const CSeq_annot& annot)
460 // ----------------------------------------------------------------------------
461 {
462 SAnnotSelector sel = SetAnnotSelector();
463 CSeq_annot_Handle sah = m_Scope.AddSeq_annot(annot);
464 auto result = xWriteAnnotFeatureTable(track, sah);
465 m_Scope.RemoveSeq_annot(sah);
466 return result;
467 }
468
469
470 // ----------------------------------------------------------------------------
xWriteAnnotFeatureTable(const CBedTrackRecord & track,const CSeq_annot_Handle & sah)471 bool CBedWriter::xWriteAnnotFeatureTable(
472 const CBedTrackRecord& track,
473 const CSeq_annot_Handle& sah)
474 // ----------------------------------------------------------------------------
475 {
476 for (CFeat_CI pMf(sah, SAnnotSelector()); pMf; ++pMf ) {
477 if (IsCanceled()) {
478 NCBI_THROW(
479 CObjWriterException,
480 eInterrupted,
481 "Processing terminated by user");
482 }
483 if (!xWriteFeature(track, *pMf)) {
484 return false;
485 }
486 }
487 return true;
488 }
489
490
491 // ----------------------------------------------------------------------------
xWriteAnnotThreeFeatData(const CBedTrackRecord & track,const CSeq_annot & annot)492 bool CBedWriter::xWriteAnnotThreeFeatData(
493 const CBedTrackRecord& track,
494 const CSeq_annot& annot)
495 // ----------------------------------------------------------------------------
496 {
497 CThreeFeatManager threeFeatManager;
498 CBedFeatureRecord bedRecord;
499
500 CSeq_annot_Handle sah = m_Scope.AddSeq_annot(annot);
501 auto result = xWriteAnnotThreeFeatData(track, sah);
502 m_Scope.RemoveSeq_annot(sah);
503 return result;
504 }
505
506 // ----------------------------------------------------------------------------
xWriteAnnotThreeFeatData(const CBedTrackRecord & track,const CSeq_annot_Handle & sah)507 bool CBedWriter::xWriteAnnotThreeFeatData(
508 const CBedTrackRecord& track,
509 const CSeq_annot_Handle& sah)
510 // ----------------------------------------------------------------------------
511 {
512 CThreeFeatManager threeFeatManager;
513 CBedFeatureRecord bedRecord;
514
515 SAnnotSelector sel = SetAnnotSelector();
516 CFeat_CI pMf(sah, sel);
517 for ( ; pMf; ++pMf ) {
518 if (IsCanceled()) {
519 NCBI_THROW(
520 CObjWriterException,
521 eInterrupted,
522 "Processing terminated by user");
523 }
524 const CSeq_feat& feature = pMf->GetOriginalFeature();
525 if (!threeFeatManager.AddFeature(feature)) {
526 break;
527 }
528 if (!threeFeatManager.IsRecordComplete(feature)) {
529 continue;
530 }
531 if (!threeFeatManager.ProcessRecord(feature, bedRecord)) {
532 break;
533 }
534 if (!bedRecord.Write(m_Os, m_colCount)) {
535 break;
536 }
537 }
538 if (!pMf) {
539 while (threeFeatManager.GetAnyRecord(bedRecord)) {
540 continue;
541 }
542 }
543 return (!pMf);
544 }
545
546
547
xWriteFeature(const CMappedFeat & mapped_feat,CSeq_annot_Handle annot_handle,CBioseq_Handle dummy_arg)548 bool CBedWriter::xWriteFeature(
549 const CMappedFeat& mapped_feat,
550 CSeq_annot_Handle annot_handle,
551 CBioseq_Handle dummy_arg)
552 {
553 // Inefficient!
554 // Store track and annot_handle, and only recreate track
555 // if the annot_handle has changed since the last call.
556 CBedTrackRecord track;
557 if ( ! track.Assign(*(annot_handle.GetCompleteSeq_annot())) ) {
558 return false;
559 }
560
561 return xWriteFeature(track, mapped_feat);
562 }
563
564
565 // ----------------------------------------------------------------------------
xWriteFeature(CFeat_CI feat_it)566 bool CBedWriter::xWriteFeature(
567 CFeat_CI feat_it)
568 // ----------------------------------------------------------------------------
569 {
570 if (!feat_it) {
571 return false;
572 }
573
574 const auto& annot_handle = feat_it.GetAnnot();
575
576 // Inefficient!
577 // Store track and annot_handle, and only recreate track
578 // if the annot_handle has changed since the last call.
579 CBedTrackRecord track;
580 if ( ! track.Assign(*(annot_handle.GetCompleteSeq_annot())) ) {
581 return false;
582 }
583
584 return xWriteFeature(track, *feat_it);
585 }
586
587
588 // ----------------------------------------------------------------------------
xWriteFeature(const CBedTrackRecord & track,const CMappedFeat & mf)589 bool CBedWriter::xWriteFeature(
590 const CBedTrackRecord& track,
591 const CMappedFeat& mf)
592 // ----------------------------------------------------------------------------
593 {
594 CBedFeatureRecord record;
595 if (!record.AssignName(mf)) {
596 return false;
597 }
598 if (!record.AssignDisplayData( mf, track.UseScore())) {
599 // feature did not contain display data ---
600 // Is there any alternative way to populate some of the bed columns?
601 // For now, keep going, emit at least the locations ...
602 }
603
604 CRef<CSeq_loc> pPackedInt(new CSeq_loc(CSeq_loc::e_Mix));
605 pPackedInt->Add(mf.GetLocation());
606 CWriteUtil::ChangeToPackedInt(*pPackedInt);
607
608 if (!pPackedInt->IsPacked_int() || !pPackedInt->GetPacked_int().CanGet()) {
609 // nothing to do
610 return true;
611 }
612
613 const list<CRef<CSeq_interval> >& sublocs = pPackedInt->GetPacked_int().Get();
614 list<CRef<CSeq_interval> >::const_iterator it;
615 for (it = sublocs.begin(); it != sublocs.end(); ++it ) {
616 if (!record.AssignLocation(m_Scope, **it) || !record.Write(m_Os, m_colCount)) {
617 return false;
618 }
619 }
620 return true;
621 }
622
623
624 END_NCBI_SCOPE
625