1 /* $Id: bed_feature_record.cpp 632624 2021-06-03 17:38:23Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Frank Ludwig
27 *
28 * File Description: Write bed file
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33
34 #include <objects/seq/Seq_annot.hpp>
35
36 #include <objects/general/User_object.hpp>
37 #include <objects/general/User_field.hpp>
38 #include <objects/general/Object_id.hpp>
39 #include <objects/seqloc/Seq_interval.hpp>
40 #include <objects/seqloc/Packed_seqint.hpp>
41
42 #include <objmgr/mapped_feat.hpp>
43 #include <objmgr/util/sequence.hpp>
44
45 #include <objtools/writers/bed_feature_record.hpp>
46 #include <objtools/writers/write_util.hpp>
47 #include <objtools/writers/genbank_id_resolve.hpp>
48
49 BEGIN_NCBI_SCOPE
50 USING_SCOPE(objects);
51
52 #define BUMP( x, y ) if ( x < (y) ) x = (y)
53
54 // ----------------------------------------------------------------------------
CBedFeatureRecord()55 CBedFeatureRecord::CBedFeatureRecord():
56 // ----------------------------------------------------------------------------
57 m_uColumnCount( 0 ),
58 m_strChrom( "." ),
59 m_strChromStart( "." ),
60 m_strChromEnd( "." ),
61 m_strName( "." ),
62 m_strScore( "0" ),
63 m_strStrand( "." ),
64 m_strThickStart( "." ),
65 m_strThickEnd( "." ),
66 m_strItemRgb( "." ),
67 m_strBlockCount( "." ),
68 m_strBlockSizes( "." ),
69 m_strBlockStarts( "." )
70 {
71 }
72
73 // ----------------------------------------------------------------------------
~CBedFeatureRecord()74 CBedFeatureRecord::~CBedFeatureRecord()
75 // ----------------------------------------------------------------------------
76 {
77 }
78
79 // ----------------------------------------------------------------------------
80 const CGene_ref&
sGetClosestGeneRef(const CMappedFeat & mf)81 sGetClosestGeneRef(
82 const CMappedFeat& mf)
83 // ----------------------------------------------------------------------------
84 {
85 static const CGene_ref noRef;
86 if (mf.GetData().IsGene()) {
87 return mf.GetData().GetGene();
88 }
89 if (mf.IsSetXref()) {
90 const auto& xrefs = mf.GetXref();
91 for (auto it = xrefs.begin(); it != xrefs.end(); ++it) {
92 const auto& xref = **it;
93 if (xref.CanGetData() && xref.GetData().IsGene()) {
94 return xref.GetData().GetGene();
95 }
96 }
97 }
98 CMappedFeat gene = feature::GetBestGeneForFeat(mf);
99 if (gene && gene.IsSetData() && gene.GetData().IsGene()) {
100 return gene.GetData().GetGene();
101 }
102 return noRef;
103 }
104
105
106 // ----------------------------------------------------------------------------
AssignName(const CMappedFeat & mf)107 bool CBedFeatureRecord::AssignName(
108 const CMappedFeat& mf)
109 // ----------------------------------------------------------------------------
110 {
111 //auto featType = mf.GetFeatSubtype();
112 //if (featType == CSeqFeatData::eSubtype_gene) {
113 // cerr << "";
114 //}
115 if (mf.GetData().IsRegion()) {
116 m_strName = mf.GetData().GetRegion();
117 std::transform(m_strName.begin(), m_strName.end(), m_strName.begin(),
118 [](char c) -> char { return std::isspace(c) ? '_' : c; });
119 return true;
120 }
121 const auto& geneRef = sGetClosestGeneRef(mf);
122 if (geneRef.IsSetLocus()) {
123 m_strName = geneRef.GetLocus();
124 return true;
125 }
126 if (geneRef.IsSetLocus_tag()) {
127 m_strName = geneRef.GetLocus_tag();
128 return true;
129 }
130
131 // try harder only if the given feature itself is a gene:
132 if (!mf.GetData().IsGene()) {
133 return true;
134 }
135 if (geneRef.IsSetDesc()) {
136 m_strName = geneRef.GetDesc();
137 }
138 if (geneRef.IsSetSyn()) {
139 m_strName = geneRef.GetSyn().front();
140 return true;
141 }
142 return true;
143 }
144
145 // ----------------------------------------------------------------------------
AssignDisplayData(const CMappedFeat & mf,bool bUseScore)146 bool CBedFeatureRecord::AssignDisplayData(
147 const CMappedFeat& mf,
148 bool bUseScore )
149 // ----------------------------------------------------------------------------
150 {
151 if ( ! mf.GetData().IsUser() ) {
152 return false;
153 }
154 const CUser_object& uo = mf.GetData().GetUser();
155 if ( ! uo.IsSetType() || ! uo.GetType().IsStr() ||
156 uo.GetType().GetStr() != "Display Data" ) {
157 return false;
158 }
159 const vector< CRef< CUser_field > >& fields = uo.GetData();
160 vector< CRef< CUser_field > >::const_iterator it = fields.begin();
161 for ( ; it != fields.end(); ++it ) {
162 if ( ! (*it)->CanGetLabel() || ! (*it)->GetLabel().IsStr() ) {
163 continue;
164 }
165 string strLabel = (*it)->GetLabel().GetStr();
166 if ( strLabel == "name" ) {
167 BUMP( m_uColumnCount, 4 );
168 if ( (*it)->IsSetData() && (*it)->GetData().IsStr() ) {
169 m_strName = (*it)->GetData().GetStr();
170 }
171 continue;
172 }
173 if ( strLabel == "score" && ! bUseScore ) {
174 BUMP( m_uColumnCount, 5 );
175 if ( (*it)->IsSetData() && (*it)->GetData().IsInt() ) {
176 m_strScore = NStr::UIntToString( (*it)->GetData().GetInt() );
177 }
178 continue;
179 }
180 if ( strLabel == "greylevel" && bUseScore ) {
181 BUMP( m_uColumnCount, 5 );
182 if ( (*it)->IsSetData() && (*it)->GetData().IsInt() ) {
183 m_strScore = NStr::UIntToString( (*it)->GetData().GetInt() );
184 }
185 continue;
186 }
187 if ( strLabel == "thickStart" ) {
188 BUMP( m_uColumnCount, 7 );
189 if ( (*it)->IsSetData() && (*it)->GetData().IsInt() ) {
190 m_strThickStart = NStr::UIntToString( (*it)->GetData().GetInt());
191 }
192 continue;
193 }
194 if ( strLabel == "thickEnd" ) {
195 BUMP( m_uColumnCount, 8 );
196 if ( (*it)->IsSetData() && (*it)->GetData().IsInt() ) {
197 m_strThickEnd = NStr::UIntToString( (*it)->GetData().GetInt()+1);
198 }
199 continue;
200 }
201 if ( strLabel == "itemRGB" ) {
202 BUMP( m_uColumnCount, 9 );
203 if ( (*it)->IsSetData() && (*it)->GetData().IsInt() ) {
204 m_strItemRgb = NStr::UIntToString( (*it)->GetData().GetInt() );
205 continue;
206 }
207 if ( (*it)->IsSetData() && (*it)->GetData().IsStr() ) {
208 m_strItemRgb = (*it)->GetData().GetStr();
209 continue;
210 }
211 continue;
212 }
213 if ( strLabel == "blockCount" ) {
214 BUMP( m_uColumnCount, 10 );
215 if ( (*it)->IsSetData() && (*it)->GetData().IsInt() ) {
216 m_strBlockCount = NStr::UIntToString( (*it)->GetData().GetInt() );
217 }
218 continue;
219 }
220 if ( strLabel == "blockSizes" ) {
221 BUMP( m_uColumnCount, 11 );
222 if ( (*it)->IsSetData() && (*it)->GetData().IsStr() ) {
223 m_strBlockSizes = (*it)->GetData().GetStr();
224 }
225 continue;
226 }
227 if ( strLabel == "blockStarts" ) {
228 BUMP( m_uColumnCount, 12 );
229 if ( (*it)->IsSetData() && (*it)->GetData().IsStr() ) {
230 m_strBlockStarts = (*it)->GetData().GetStr();
231 }
232 continue;
233 }
234 }
235 return true;
236 }
237
238 // ----------------------------------------------------------------------------
AssignLocation(CScope & scope,const CSeq_interval & interval)239 bool CBedFeatureRecord::AssignLocation(
240 CScope& scope,
241 const CSeq_interval& interval )
242 // ----------------------------------------------------------------------------
243 {
244 if ( interval.CanGetId() ) {
245 m_strChrom = interval.GetId().GetSeqIdString(true);
246 try {
247 string bestId;
248 CGenbankIdResolve::Get().GetBestId(
249 CSeq_id_Handle::GetHandle(interval.GetId()), scope, bestId);
250 m_strChrom = bestId;
251 // its OK for this to silently fail.
252 // Seq-ids that dont look up are just meant to be output directly
253 } catch(...) { ; }
254 }
255 if ( interval.IsSetFrom() ) {
256 m_strChromStart = NStr::UIntToString( interval.GetFrom() );
257 }
258 if ( interval.IsSetTo() ) {
259 m_strChromEnd = NStr::UIntToString( interval.GetTo() + 1 );
260 }
261 BUMP( m_uColumnCount, 3 );
262 m_strStrand = "+";
263 if (interval.IsSetStrand()) {
264 ENa_strand strand = interval.GetStrand();
265 if (strand == eNa_strand_minus) {
266 m_strStrand = "-";
267 }
268 }
269 return true;
270 }
271
272 // ----------------------------------------------------------------------------
Write(CNcbiOstream & ostr,unsigned int columnCount)273 bool CBedFeatureRecord::Write(
274 CNcbiOstream& ostr,
275 unsigned int columnCount)
276 // ----------------------------------------------------------------------------
277 {
278 ostr << Chrom();
279 ostr << "\t" << ChromStart();
280 ostr << "\t" << ChromEnd();
281 if ( columnCount >= 4 ) {
282 ostr << "\t" << Name();
283 }
284 if ( columnCount >= 5 ) {
285 ostr << "\t" << Score();
286 }
287 if ( columnCount >= 6 ) {
288 ostr << "\t" << Strand();
289 }
290 if ( columnCount >= 7 ) {
291 ostr << "\t" << ThickStart();
292 }
293 if ( columnCount >= 8 ) {
294 ostr << "\t" << ThickEnd();
295 }
296 if ( columnCount >= 9 ) {
297 ostr << "\t" << ItemRgb();
298 }
299 if ( columnCount >= 10 ) {
300 ostr << "\t" << BlockCount();
301 }
302 if ( columnCount >= 11 ) {
303 ostr << "\t" << BlockSizes();
304 }
305 if ( columnCount >= 12 ) {
306 ostr << "\t" << BlockStarts();
307 }
308 ostr << '\n';
309 return true;
310 }
311
312
313 // ----------------------------------------------------------------------------
SetLocation(const CSeq_loc & loc)314 bool CBedFeatureRecord::SetLocation(
315 const CSeq_loc& loc)
316 // ----------------------------------------------------------------------------
317 {
318 string bestId;
319 if (!CGenbankIdResolve::Get().GetBestId(loc, bestId)) {
320 return false;
321 }
322 if (loc.IsInt()) {
323 const CSeq_interval& chromInt = loc.GetInt();
324 m_strChrom = bestId;
325 m_strChromStart = NStr::IntToString(chromInt.GetFrom());
326 m_strChromEnd = NStr::IntToString(chromInt.GetTo()+1);
327 m_strStrand = "+";
328 if (chromInt.IsSetStrand()) {
329 ENa_strand strand = chromInt.GetStrand();
330 if (strand == eNa_strand_minus) {
331 m_strStrand = "-";
332 }
333 }
334 return true;
335 }
336
337 if (loc.IsPnt()) {
338 const CSeq_point& chromPt = loc.GetPnt();
339 m_strChrom = bestId;
340 m_strChromStart = NStr::IntToString(chromPt.GetPoint());
341 m_strChromEnd = NStr::IntToString(chromPt.GetPoint()+1);
342 m_strStrand = "+";
343 if (chromPt.IsSetStrand()) {
344 ENa_strand strand = chromPt.GetStrand();
345 if (strand == eNa_strand_minus) {
346 m_strStrand = "-";
347 }
348 }
349 return true;
350 }
351 return false;
352 }
353
354 // -----------------------------------------------------------------------------
SetName(const CSeqFeatData & data)355 bool CBedFeatureRecord::SetName(
356 const CSeqFeatData& data)
357 // -----------------------------------------------------------------------------
358 {
359 if (data.IsRegion()) {
360 m_strName = data.GetRegion();
361 return true;
362 }
363 if (data.IsGene()) {
364 }
365 return false;
366 }
367
368 // -----------------------------------------------------------------------------
SetScore(int score)369 bool CBedFeatureRecord::SetScore(
370 int score)
371 // -----------------------------------------------------------------------------
372 {
373 m_strScore = NStr::IntToString(score);
374 return true;
375 }
376
377 // ----------------------------------------------------------------------------
SetThick(const CSeq_loc & loc)378 bool CBedFeatureRecord::SetThick(
379 const CSeq_loc& loc)
380 // ----------------------------------------------------------------------------
381 {
382 if (loc.IsInt()) {
383 const CSeq_interval& thickInt = loc.GetInt();
384 m_strThickStart = NStr::IntToString(thickInt.GetFrom());
385 m_strThickEnd = NStr::IntToString(thickInt.GetTo()+1);
386 return true;
387 }
388 if (loc.IsPnt()) {
389 const CSeq_point& thickPoint = loc.GetPnt();
390 m_strThickStart = NStr::IntToString(thickPoint.GetPoint());
391 m_strThickEnd = NStr::IntToString(thickPoint.GetPoint()+1);
392 return true;
393 }
394 return false;
395 }
396
397 // -----------------------------------------------------------------------------
SetBlocks(const CSeq_loc & chrom,const CSeq_loc & blocks)398 bool CBedFeatureRecord::SetBlocks(
399 const CSeq_loc& chrom,
400 const CSeq_loc& blocks)
401 // -----------------------------------------------------------------------------
402 {
403 if (blocks.IsInt()) {
404 return true;
405 }
406 if (blocks.IsPacked_int()) {
407 const list<CRef<CSeq_interval> >& intervals = blocks.GetPacked_int().Get();
408 ENa_strand strand = blocks.GetStrand();
409 int offset = chrom.GetStart(eExtreme_Positional);
410 list<string> blockStarts;
411 list<string> blockSizes;
412
413 for (auto pInterval: intervals) {
414 const CSeq_interval& interval = *pInterval;
415 if (strand == eNa_strand_minus) {
416 blockStarts.push_front(NStr::NumericToString(interval.GetFrom()-offset));
417 blockSizes.push_front(NStr::NumericToString(interval.GetLength()));
418 }
419 else {
420 blockStarts.push_back(NStr::NumericToString(interval.GetFrom()-offset));
421 blockSizes.push_back(NStr::NumericToString(interval.GetLength()));
422 }
423 }
424 m_strBlockCount = NStr::NumericToString(intervals.size());
425 m_strBlockStarts = NStr::Join(blockStarts, ",");
426 m_strBlockSizes = NStr::Join(blockSizes, ",");
427 return true;
428 }
429 return false;
430 }
431
432 // -----------------------------------------------------------------------------
SetNoThick(const CSeq_loc & loc)433 bool CBedFeatureRecord::SetNoThick(
434 const CSeq_loc& loc)
435 // -----------------------------------------------------------------------------
436 {
437 if (loc.IsInt()) {
438 const CSeq_interval& chromInt = loc.GetInt();
439 m_strThickStart = NStr::IntToString(chromInt.GetFrom());
440 m_strThickEnd = NStr::IntToString(chromInt.GetFrom());
441 return true;
442 }
443
444 if (loc.IsPnt()) {
445 const CSeq_point& chromPt = loc.GetPnt();
446 m_strThickStart = NStr::IntToString(chromPt.GetPoint());
447 m_strThickEnd = NStr::IntToString(chromPt.GetPoint());
448 return true;
449 }
450 return false;
451 }
452
453 // -----------------------------------------------------------------------------
SetRgb(const string & color)454 bool CBedFeatureRecord::SetRgb(
455 const string& color)
456 // -----------------------------------------------------------------------------
457 {
458 if (color == "0 0 0") {
459 m_strItemRgb = "0";
460 return true;
461 }
462 vector<string> rgb;
463 NStr::Split(color, " ", rgb);
464 m_strItemRgb = NStr::Join(rgb, ",");
465 return true;
466 }
467
468 // -----------------------------------------------------------------------------
469
470 END_NCBI_SCOPE
471