1 /*  $Id: bed_feature_record.cpp 632624 2021-06-03 17:38:23Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Frank Ludwig
27  *
28  * File Description:  Write bed file
29  *
30  */
31 
32 #include <ncbi_pch.hpp>
33 
34 #include <objects/seq/Seq_annot.hpp>
35 
36 #include <objects/general/User_object.hpp>
37 #include <objects/general/User_field.hpp>
38 #include <objects/general/Object_id.hpp>
39 #include <objects/seqloc/Seq_interval.hpp>
40 #include <objects/seqloc/Packed_seqint.hpp>
41 
42 #include <objmgr/mapped_feat.hpp>
43 #include <objmgr/util/sequence.hpp>
44 
45 #include <objtools/writers/bed_feature_record.hpp>
46 #include <objtools/writers/write_util.hpp>
47 #include <objtools/writers/genbank_id_resolve.hpp>
48 
49 BEGIN_NCBI_SCOPE
50 USING_SCOPE(objects);
51 
52 #define BUMP( x, y ) if ( x < (y) ) x = (y)
53 
54 //  ----------------------------------------------------------------------------
CBedFeatureRecord()55 CBedFeatureRecord::CBedFeatureRecord():
56 //  ----------------------------------------------------------------------------
57     m_uColumnCount( 0 ),
58     m_strChrom( "." ),
59     m_strChromStart( "." ),
60     m_strChromEnd( "." ),
61     m_strName( "." ),
62     m_strScore( "0" ),
63     m_strStrand( "." ),
64     m_strThickStart( "." ),
65     m_strThickEnd( "." ),
66     m_strItemRgb( "." ),
67     m_strBlockCount( "." ),
68     m_strBlockSizes( "." ),
69     m_strBlockStarts( "." )
70 {
71 }
72 
73 //  ----------------------------------------------------------------------------
~CBedFeatureRecord()74 CBedFeatureRecord::~CBedFeatureRecord()
75 //  ----------------------------------------------------------------------------
76 {
77 }
78 
79 //  ----------------------------------------------------------------------------
80 const CGene_ref&
sGetClosestGeneRef(const CMappedFeat & mf)81 sGetClosestGeneRef(
82     const CMappedFeat& mf)
83 //  ----------------------------------------------------------------------------
84 {
85     static const CGene_ref noRef;
86     if (mf.GetData().IsGene()) {
87         return mf.GetData().GetGene();
88     }
89     if (mf.IsSetXref()) {
90         const auto& xrefs = mf.GetXref();
91         for (auto it = xrefs.begin(); it != xrefs.end(); ++it) {
92             const auto& xref = **it;
93             if (xref.CanGetData() && xref.GetData().IsGene()) {
94                 return xref.GetData().GetGene();
95             }
96         }
97     }
98     CMappedFeat gene = feature::GetBestGeneForFeat(mf);
99     if (gene  &&  gene.IsSetData()  &&  gene.GetData().IsGene()) {
100         return gene.GetData().GetGene();
101     }
102     return noRef;
103 }
104 
105 
106 //  ----------------------------------------------------------------------------
AssignName(const CMappedFeat & mf)107 bool CBedFeatureRecord::AssignName(
108     const CMappedFeat& mf)
109 //  ----------------------------------------------------------------------------
110 {
111     //auto featType = mf.GetFeatSubtype();
112     //if (featType == CSeqFeatData::eSubtype_gene) {
113     //    cerr << "";
114     //}
115     if (mf.GetData().IsRegion()) {
116         m_strName = mf.GetData().GetRegion();
117         std::transform(m_strName.begin(), m_strName.end(), m_strName.begin(),
118             [](char c) -> char { return std::isspace(c) ? '_' : c; });
119         return true;
120     }
121     const auto& geneRef = sGetClosestGeneRef(mf);
122     if (geneRef.IsSetLocus()) {
123         m_strName = geneRef.GetLocus();
124         return true;
125     }
126     if (geneRef.IsSetLocus_tag()) {
127         m_strName = geneRef.GetLocus_tag();
128         return true;
129     }
130 
131     // try harder only if the given feature itself is a gene:
132     if (!mf.GetData().IsGene()) {
133         return true;
134     }
135     if (geneRef.IsSetDesc()) {
136         m_strName = geneRef.GetDesc();
137     }
138     if (geneRef.IsSetSyn()) {
139         m_strName = geneRef.GetSyn().front();
140         return true;
141     }
142     return true;
143 }
144 
145 //  ----------------------------------------------------------------------------
AssignDisplayData(const CMappedFeat & mf,bool bUseScore)146 bool CBedFeatureRecord::AssignDisplayData(
147     const CMappedFeat& mf,
148     bool bUseScore )
149 //  ----------------------------------------------------------------------------
150 {
151     if ( ! mf.GetData().IsUser() ) {
152         return false;
153     }
154     const CUser_object& uo = mf.GetData().GetUser();
155     if ( ! uo.IsSetType() || ! uo.GetType().IsStr() ||
156         uo.GetType().GetStr() != "Display Data" ) {
157         return false;
158     }
159     const vector< CRef< CUser_field > >& fields = uo.GetData();
160     vector< CRef< CUser_field > >::const_iterator it = fields.begin();
161     for ( ; it != fields.end(); ++it ) {
162         if ( ! (*it)->CanGetLabel() || ! (*it)->GetLabel().IsStr() ) {
163             continue;
164         }
165         string strLabel = (*it)->GetLabel().GetStr();
166         if ( strLabel == "name" ) {
167             BUMP( m_uColumnCount, 4 );
168             if ( (*it)->IsSetData() && (*it)->GetData().IsStr() ) {
169                 m_strName = (*it)->GetData().GetStr();
170             }
171             continue;
172         }
173         if ( strLabel == "score" && ! bUseScore ) {
174             BUMP( m_uColumnCount, 5 );
175             if ( (*it)->IsSetData() && (*it)->GetData().IsInt() ) {
176                 m_strScore = NStr::UIntToString( (*it)->GetData().GetInt() );
177             }
178             continue;
179         }
180         if ( strLabel == "greylevel" && bUseScore ) {
181             BUMP( m_uColumnCount, 5 );
182             if ( (*it)->IsSetData() && (*it)->GetData().IsInt() ) {
183                 m_strScore = NStr::UIntToString( (*it)->GetData().GetInt() );
184             }
185             continue;
186         }
187         if ( strLabel == "thickStart" ) {
188             BUMP( m_uColumnCount, 7 );
189             if ( (*it)->IsSetData() && (*it)->GetData().IsInt() ) {
190                 m_strThickStart = NStr::UIntToString( (*it)->GetData().GetInt());
191             }
192             continue;
193         }
194         if ( strLabel == "thickEnd" ) {
195             BUMP( m_uColumnCount, 8 );
196             if ( (*it)->IsSetData() && (*it)->GetData().IsInt() ) {
197                 m_strThickEnd = NStr::UIntToString( (*it)->GetData().GetInt()+1);
198             }
199             continue;
200         }
201         if ( strLabel == "itemRGB" ) {
202             BUMP( m_uColumnCount, 9 );
203             if ( (*it)->IsSetData() && (*it)->GetData().IsInt() ) {
204                 m_strItemRgb = NStr::UIntToString( (*it)->GetData().GetInt() );
205                 continue;
206             }
207             if ( (*it)->IsSetData() && (*it)->GetData().IsStr() ) {
208                 m_strItemRgb = (*it)->GetData().GetStr();
209                 continue;
210             }
211             continue;
212         }
213         if ( strLabel == "blockCount" ) {
214             BUMP( m_uColumnCount, 10 );
215             if ( (*it)->IsSetData() && (*it)->GetData().IsInt() ) {
216                 m_strBlockCount = NStr::UIntToString( (*it)->GetData().GetInt() );
217             }
218             continue;
219         }
220         if ( strLabel == "blockSizes" ) {
221             BUMP( m_uColumnCount, 11 );
222             if ( (*it)->IsSetData() && (*it)->GetData().IsStr() ) {
223                 m_strBlockSizes = (*it)->GetData().GetStr();
224             }
225             continue;
226         }
227         if ( strLabel == "blockStarts" ) {
228             BUMP( m_uColumnCount, 12 );
229             if ( (*it)->IsSetData() && (*it)->GetData().IsStr() ) {
230                 m_strBlockStarts = (*it)->GetData().GetStr();
231             }
232             continue;
233         }
234     }
235     return true;
236 }
237 
238 //  ----------------------------------------------------------------------------
AssignLocation(CScope & scope,const CSeq_interval & interval)239 bool CBedFeatureRecord::AssignLocation(
240     CScope& scope,
241     const CSeq_interval& interval )
242 //  ----------------------------------------------------------------------------
243 {
244     if ( interval.CanGetId() ) {
245         m_strChrom = interval.GetId().GetSeqIdString(true);
246         try {
247             string bestId;
248             CGenbankIdResolve::Get().GetBestId(
249                 CSeq_id_Handle::GetHandle(interval.GetId()), scope, bestId);
250             m_strChrom = bestId;
251             // its OK for this to silently fail.
252             // Seq-ids that dont look up are just meant to be output directly
253         } catch(...) { ; }
254     }
255     if ( interval.IsSetFrom() ) {
256         m_strChromStart = NStr::UIntToString( interval.GetFrom() );
257     }
258     if ( interval.IsSetTo() ) {
259         m_strChromEnd = NStr::UIntToString( interval.GetTo() + 1 );
260     }
261     BUMP( m_uColumnCount, 3 );
262     m_strStrand = "+";
263     if (interval.IsSetStrand()) {
264         ENa_strand strand = interval.GetStrand();
265         if (strand == eNa_strand_minus) {
266             m_strStrand = "-";
267         }
268     }
269     return true;
270 }
271 
272 //  ----------------------------------------------------------------------------
Write(CNcbiOstream & ostr,unsigned int columnCount)273 bool CBedFeatureRecord::Write(
274     CNcbiOstream& ostr,
275     unsigned int columnCount)
276 //  ----------------------------------------------------------------------------
277 {
278     ostr << Chrom();
279     ostr << "\t" << ChromStart();
280     ostr << "\t" << ChromEnd();
281     if ( columnCount >= 4 ) {
282         ostr << "\t" << Name();
283     }
284     if ( columnCount >= 5 ) {
285         ostr << "\t" << Score();
286     }
287     if ( columnCount >= 6 ) {
288         ostr << "\t" << Strand();
289     }
290     if ( columnCount >= 7 ) {
291         ostr << "\t" << ThickStart();
292     }
293     if ( columnCount >= 8 ) {
294         ostr << "\t" << ThickEnd();
295     }
296     if ( columnCount >= 9 ) {
297         ostr << "\t" << ItemRgb();
298     }
299     if ( columnCount >= 10 ) {
300         ostr << "\t" << BlockCount();
301     }
302     if ( columnCount >= 11 ) {
303         ostr << "\t" << BlockSizes();
304     }
305     if ( columnCount >= 12 ) {
306         ostr << "\t" << BlockStarts();
307     }
308     ostr << '\n';
309     return true;
310 }
311 
312 
313 //  ----------------------------------------------------------------------------
SetLocation(const CSeq_loc & loc)314 bool CBedFeatureRecord::SetLocation(
315     const CSeq_loc& loc)
316 //  ----------------------------------------------------------------------------
317 {
318     string bestId;
319     if (!CGenbankIdResolve::Get().GetBestId(loc, bestId)) {
320         return false;
321     }
322     if (loc.IsInt()) {
323         const CSeq_interval& chromInt = loc.GetInt();
324         m_strChrom = bestId;
325         m_strChromStart = NStr::IntToString(chromInt.GetFrom());
326         m_strChromEnd = NStr::IntToString(chromInt.GetTo()+1);
327         m_strStrand = "+";
328         if (chromInt.IsSetStrand()) {
329             ENa_strand strand = chromInt.GetStrand();
330             if (strand == eNa_strand_minus) {
331                 m_strStrand = "-";
332             }
333         }
334         return true;
335     }
336 
337     if (loc.IsPnt()) {
338         const CSeq_point& chromPt = loc.GetPnt();
339         m_strChrom = bestId;
340         m_strChromStart = NStr::IntToString(chromPt.GetPoint());
341         m_strChromEnd = NStr::IntToString(chromPt.GetPoint()+1);
342         m_strStrand = "+";
343         if (chromPt.IsSetStrand()) {
344             ENa_strand strand = chromPt.GetStrand();
345             if (strand == eNa_strand_minus) {
346                 m_strStrand = "-";
347             }
348         }
349         return true;
350     }
351     return false;
352 }
353 
354 //  -----------------------------------------------------------------------------
SetName(const CSeqFeatData & data)355 bool CBedFeatureRecord::SetName(
356     const CSeqFeatData& data)
357 //  -----------------------------------------------------------------------------
358 {
359     if (data.IsRegion()) {
360         m_strName = data.GetRegion();
361         return true;
362     }
363     if (data.IsGene()) {
364     }
365     return false;
366 }
367 
368 //  -----------------------------------------------------------------------------
SetScore(int score)369 bool CBedFeatureRecord::SetScore(
370     int score)
371 //  -----------------------------------------------------------------------------
372 {
373     m_strScore = NStr::IntToString(score);
374     return true;
375 }
376 
377 //  ----------------------------------------------------------------------------
SetThick(const CSeq_loc & loc)378 bool CBedFeatureRecord::SetThick(
379     const CSeq_loc& loc)
380 //  ----------------------------------------------------------------------------
381 {
382     if (loc.IsInt()) {
383         const CSeq_interval& thickInt = loc.GetInt();
384         m_strThickStart = NStr::IntToString(thickInt.GetFrom());
385         m_strThickEnd = NStr::IntToString(thickInt.GetTo()+1);
386         return true;
387     }
388     if (loc.IsPnt()) {
389         const CSeq_point& thickPoint = loc.GetPnt();
390         m_strThickStart = NStr::IntToString(thickPoint.GetPoint());
391         m_strThickEnd = NStr::IntToString(thickPoint.GetPoint()+1);
392         return true;
393     }
394     return false;
395 }
396 
397 //  -----------------------------------------------------------------------------
SetBlocks(const CSeq_loc & chrom,const CSeq_loc & blocks)398 bool CBedFeatureRecord::SetBlocks(
399     const CSeq_loc& chrom,
400     const CSeq_loc& blocks)
401 //  -----------------------------------------------------------------------------
402 {
403     if (blocks.IsInt()) {
404         return true;
405     }
406     if (blocks.IsPacked_int()) {
407         const list<CRef<CSeq_interval> >& intervals = blocks.GetPacked_int().Get();
408         ENa_strand strand = blocks.GetStrand();
409         int offset = chrom.GetStart(eExtreme_Positional);
410         list<string> blockStarts;
411         list<string> blockSizes;
412 
413         for (auto pInterval: intervals) {
414             const CSeq_interval& interval = *pInterval;
415             if (strand == eNa_strand_minus) {
416                 blockStarts.push_front(NStr::NumericToString(interval.GetFrom()-offset));
417                 blockSizes.push_front(NStr::NumericToString(interval.GetLength()));
418             }
419             else {
420                 blockStarts.push_back(NStr::NumericToString(interval.GetFrom()-offset));
421                 blockSizes.push_back(NStr::NumericToString(interval.GetLength()));
422             }
423         }
424         m_strBlockCount  = NStr::NumericToString(intervals.size());
425         m_strBlockStarts = NStr::Join(blockStarts, ",");
426         m_strBlockSizes  = NStr::Join(blockSizes, ",");
427         return true;
428     }
429     return false;
430 }
431 
432 //  -----------------------------------------------------------------------------
SetNoThick(const CSeq_loc & loc)433 bool CBedFeatureRecord::SetNoThick(
434     const CSeq_loc& loc)
435 //  -----------------------------------------------------------------------------
436 {
437     if (loc.IsInt()) {
438         const CSeq_interval& chromInt = loc.GetInt();
439         m_strThickStart = NStr::IntToString(chromInt.GetFrom());
440         m_strThickEnd = NStr::IntToString(chromInt.GetFrom());
441         return true;
442     }
443 
444     if (loc.IsPnt()) {
445         const CSeq_point& chromPt = loc.GetPnt();
446         m_strThickStart = NStr::IntToString(chromPt.GetPoint());
447         m_strThickEnd = NStr::IntToString(chromPt.GetPoint());
448         return true;
449     }
450     return false;
451 }
452 
453 //  -----------------------------------------------------------------------------
SetRgb(const string & color)454 bool CBedFeatureRecord::SetRgb(
455     const string& color)
456 //  -----------------------------------------------------------------------------
457 {
458     if (color == "0 0 0") {
459         m_strItemRgb = "0";
460         return true;
461     }
462     vector<string> rgb;
463     NStr::Split(color, " ", rgb);
464     m_strItemRgb = NStr::Join(rgb, ",");
465     return true;
466 }
467 
468 //  -----------------------------------------------------------------------------
469 
470 END_NCBI_SCOPE
471