1 /* $Id: wig2table.cpp 637992 2021-09-21 18:48:18Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Aaron Ucko
27 *
28 * File Description:
29 * Simple program demonstrating the use of serializable objects (in this
30 * case, biological sequences). Does NOT use the object manager.
31 *
32 * ===========================================================================
33 */
34
35 #include <ncbi_pch.hpp>
36 #include <corelib/ncbiapp.hpp>
37 #include <corelib/ncbiargs.hpp>
38 #include <corelib/ncbienv.hpp>
39
40 #include <serial/iterator.hpp>
41 #include <serial/objistr.hpp>
42 #include <serial/objostr.hpp>
43 #include <serial/serial.hpp>
44
45 #include <objects/general/Object_id.hpp>
46 #include <objects/general/User_object.hpp>
47 #include <objects/general/User_field.hpp>
48 #include <objects/seqloc/seqloc__.hpp>
49 #include <objects/seqtable/seqtable__.hpp>
50 #include <objects/seqres/seqres__.hpp>
51 #include <objects/seq/Seq_annot.hpp>
52 #include <objects/seq/Annot_descr.hpp>
53 #include <objects/seq/Annotdesc.hpp>
54 #include <objtools/readers/idmapper.hpp>
55 #include <corelib/ncbifile.hpp>
56
57 USING_SCOPE(ncbi);
58 USING_SCOPE(objects);
59
60 class CWigBufferedLineReader
61 {
62 public:
63 /// read from the file, "-" (but not "./-") means standard input
64 CWigBufferedLineReader(const string& filename);
65 virtual ~CWigBufferedLineReader();
66
67 bool AtEOF(void) const;
68 CWigBufferedLineReader& operator++(void);
69 void UngetLine(void);
70 CTempStringEx operator*(void) const;
71 CT_POS_TYPE GetPosition(void) const;
72 Uint8 GetLineNumber(void) const;
73
74 protected:
75
76 void x_LoadLong(void);
77 bool x_ReadBuffer(void);
78
79 private:
80 AutoPtr<IReader> m_Reader;
81 bool m_Eof;
82 bool m_UngetLine;
83 size_t m_BufferSize;
84 AutoArray<char> m_Buffer;
85 char* m_Pos;
86 char* m_End;
87 CTempStringEx m_Line;
88 string m_String;
89 CT_POS_TYPE m_InputPos;
90 Uint8 m_LineNumber;
91 };
92
93
CWigBufferedLineReader(const string & filename)94 CWigBufferedLineReader::CWigBufferedLineReader(const string& filename)
95 : m_Reader(CFileReader::New(filename)),
96 m_Eof(false),
97 m_UngetLine(false),
98 m_BufferSize(32*1024),
99 m_Buffer(new char[m_BufferSize]),
100 m_Pos(m_Buffer.get()),
101 m_End(m_Pos),
102 m_InputPos(0),
103 m_LineNumber(0)
104 {
105 x_ReadBuffer();
106 }
107
108
~CWigBufferedLineReader()109 CWigBufferedLineReader::~CWigBufferedLineReader()
110 {
111 }
112
113
AtEOF(void) const114 bool CWigBufferedLineReader::AtEOF(void) const
115 {
116 return m_Eof && !m_UngetLine;
117 }
118
119
UngetLine(void)120 void CWigBufferedLineReader::UngetLine(void)
121 {
122 _ASSERT(!m_UngetLine);
123 _ASSERT(m_Line.begin());
124 --m_LineNumber;
125 m_UngetLine = true;
126 }
127
128
operator ++(void)129 CWigBufferedLineReader& CWigBufferedLineReader::operator++(void)
130 {
131 ++m_LineNumber;
132 if ( m_UngetLine ) {
133 _ASSERT(m_Line.begin());
134 m_UngetLine = false;
135 return *this;
136 }
137 // check if we are at the buffer end
138 char* start = m_Pos;
139 char* end = m_End;
140 for ( char* p = start; p < end; ++p ) {
141 if ( *p == '\n' ) {
142 *p = '\0';
143 m_Line.assign(start, p - start, CTempStringEx::eHasZeroAtEnd);
144 m_Pos = ++p;
145 if ( p == end ) {
146 m_String = m_Line;
147 m_Line = m_String;
148 x_ReadBuffer();
149 }
150 return *this;
151 }
152 else if ( *p == '\r' ) {
153 *p = '\0';
154 m_Line.assign(start, p - start, CTempStringEx::eHasZeroAtEnd);
155 if ( ++p == end ) {
156 m_String = m_Line;
157 m_Line = m_String;
158 if ( x_ReadBuffer() ) {
159 p = m_Pos;
160 if ( *p == '\n' ) {
161 m_Pos = p+1;
162 }
163 }
164 return *this;
165 }
166 if ( *p != '\n' ) {
167 return *this;
168 }
169 m_Pos = ++p;
170 if ( p == end ) {
171 m_String = m_Line;
172 m_Line = m_String;
173 x_ReadBuffer();
174 }
175 return *this;
176 }
177 }
178 x_LoadLong();
179 return *this;
180 }
181
182
x_LoadLong(void)183 void CWigBufferedLineReader::x_LoadLong(void)
184 {
185 char* start = m_Pos;
186 char* end = m_End;
187 m_String.assign(start, end);
188 while ( x_ReadBuffer() ) {
189 start = m_Pos;
190 end = m_End;
191 for ( char* p = start; p < end; ++p ) {
192 char c = *p;
193 if ( c == '\r' || c == '\n' ) {
194 m_String.append(start, p - start);
195 m_Line = m_String;
196 if ( ++p == end ) {
197 m_String = m_Line;
198 m_Line = m_String;
199 if ( x_ReadBuffer() ) {
200 p = m_Pos;
201 end = m_End;
202 if ( p < end && c == '\r' && *p == '\n' ) {
203 ++p;
204 m_Pos = p;
205 }
206 }
207 }
208 else {
209 if ( c == '\r' && *p == '\n' ) {
210 if ( ++p == end ) {
211 x_ReadBuffer();
212 p = m_Pos;
213 }
214 }
215 m_Pos = p;
216 }
217 return;
218 }
219 }
220 m_String.append(start, end - start);
221 }
222 m_Line = m_String;
223 return;
224 }
225
226
x_ReadBuffer()227 bool CWigBufferedLineReader::x_ReadBuffer()
228 {
229 _ASSERT(m_Reader);
230
231 if ( m_Eof ) {
232 return false;
233 }
234
235 m_InputPos += CT_OFF_TYPE(m_End - m_Buffer.get());
236 m_Pos = m_End = m_Buffer.get();
237 for (bool flag = true; flag; ) {
238 size_t size;
239 ERW_Result result =
240 m_Reader->Read(m_Buffer.get(), m_BufferSize, &size);
241 switch (result) {
242 case eRW_NotImplemented:
243 case eRW_Error:
244 NCBI_THROW(CIOException, eRead, "Read error");
245 /*NOTREACHED*/
246 break;
247 case eRW_Timeout:
248 // keep spinning around
249 break;
250 case eRW_Eof:
251 m_Eof = true;
252 // fall through
253 case eRW_Success:
254 m_End = m_Pos + size;
255 return (result == eRW_Success || size > 0);
256 default:
257 _ASSERT(0);
258 }
259 } // for
260 return false;
261 }
262
263
operator *(void) const264 CTempStringEx CWigBufferedLineReader::operator*(void) const
265 {
266 return m_Line;
267 }
268
269
GetPosition(void) const270 CT_POS_TYPE CWigBufferedLineReader::GetPosition(void) const
271 {
272 return m_InputPos + CT_OFF_TYPE(m_Pos - m_Buffer.get());
273 }
274
275
GetLineNumber(void) const276 Uint8 CWigBufferedLineReader::GetLineNumber(void) const
277 {
278 return m_LineNumber;
279 }
280
281
282 class CWig2tableApplication : public CNcbiApplication
283 {
284 virtual void Init(void);
285 virtual int Run(void);
286
287 CRef<CSeq_table> MakeTable(void);
288 CRef<CSeq_graph> MakeGraph(void);
289 CRef<CSeq_annot> MakeAnnot(void);
290 CRef<CSeq_annot> MakeTableAnnot(void);
291 CRef<CSeq_annot> MakeGraphAnnot(void);
292 void DumpAnnot(void);
293 void DumpChromValues(void);
294 void ResetChromValues(void);
295 void SetChrom(CTempString chrom);
296
297 double EstimateSize(size_t rows, bool fixed_span) const;
298
299 void ReadBrowser(void);
300 void ReadTrack(void);
301 void ReadFixedStep(void);
302 void ReadVariableStep(void);
303 void ReadBedLine(CTempString chrom);
304
305 AutoPtr<CWigBufferedLineReader> m_Input;
306 AutoPtr<CObjectOStream> m_Output;
307
308 CTempStringEx m_CurLine;
309
310 bool x_GetLine(void);
311 void x_UngetLine(void);
312 bool x_SkipWS(void);
313 bool x_CommentLine(void) const;
314 CTempString x_GetWord(void);
315 CTempString x_GetParamName(void);
316 CTempString x_GetParamValue(void);
317 bool x_TryGetPos(TSeqPos& v);
318 bool x_TryGetDoubleSimple(double& v);
319 bool x_TryGetDouble(double& v);
320 void x_GetPos(TSeqPos& v);
321 void x_GetDouble(double& v);
322
323 void x_Error(const char* msg);
324
325 string m_ChromId;
326 struct SValueInfo {
327 TSeqPos m_Pos;
328 TSeqPos m_Span;
329 double m_Value;
330
GetEndCWig2tableApplication::SValueInfo331 TSeqPos GetEnd(void) const {
332 return m_Pos + m_Span;
333 }
operator <CWig2tableApplication::SValueInfo334 bool operator<(const SValueInfo& v) const {
335 return m_Pos < v.m_Pos;
336 }
337 };
338
339 struct SStat {
340 bool m_FixedSpan;
341 bool m_HaveGaps;
342 bool m_IntValues;
343 TSeqPos m_Span;
344 double m_Min, m_Max, m_Step, m_StepMul;
345
SStatCWig2tableApplication::SStat346 SStat(void)
347 : m_FixedSpan(true),
348 m_HaveGaps(false),
349 m_IntValues(true),
350 m_Span(1),
351 m_Min(0),
352 m_Max(0),
353 m_Step(1),
354 m_StepMul(1)
355 {
356 }
SetFirstSpanCWig2tableApplication::SStat357 void SetFirstSpan(TSeqPos span)
358 {
359 m_FixedSpan = true;
360 m_Span = span;
361 }
AddSpanCWig2tableApplication::SStat362 void AddSpan(TSeqPos span)
363 {
364 if ( span != m_Span ) {
365 m_FixedSpan = false;
366 }
367 }
SetFirstValueCWig2tableApplication::SStat368 void SetFirstValue(double v)
369 {
370 m_Min = m_Max = v;
371 m_IntValues = v == int(v);
372 }
AddValueCWig2tableApplication::SStat373 void AddValue(double v)
374 {
375 if ( v < m_Min ) {
376 m_Min = v;
377 }
378 if ( v > m_Max ) {
379 m_Max = v;
380 }
381 if ( m_IntValues && v != int(v) ) {
382 m_IntValues = false;
383 }
384 }
AsByteCWig2tableApplication::SStat385 int AsByte(double v) const
386 {
387 return int((v-m_Min)*m_StepMul+.5);
388 }
389 };
390 // sort values and return min and max values
391 SStat x_PreprocessValues(void);
392 CRef<CSeq_id> x_MakeChromId(void);
393 void x_SetTotalLoc(CSeq_loc& loc, CSeq_id& chrom_id);
394
AddValue(const SValueInfo & value)395 void AddValue(const SValueInfo& value) {
396 if ( !m_OmitZeros || value.m_Value != 0 ) {
397 m_Values.push_back(value);
398 }
399 }
400
401 string m_TrackName;
402 string m_TrackDescription;
403 string m_TrackTypeValue;
404 enum ETrackType {
405 eTrackType_invalid,
406 eTrackType_wiggle_0,
407 eTrackType_bedGraph
408 };
409 ETrackType m_TrackType;
410 typedef map<string, string> TTrackParams;
411 TTrackParams m_TrackParams;
412 typedef vector<SValueInfo> TValues;
413 TValues m_Values;
414 CRef<CSeq_annot> m_Annot;
415
416 bool m_AsGraph;
417 bool m_SingleAnnot;
418 bool m_OmitZeros;
419 bool m_JoinSame;
420 bool m_AsByte;
421 bool m_KeepInteger;
422 double m_GapValue;
423 AutoPtr<IIdMapper> m_IdMapper;
424 };
425
426
Init(void)427 void CWig2tableApplication::Init(void)
428 {
429 // Create command-line argument descriptions class
430 auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
431
432 // Specify USAGE context
433 arg_desc->SetUsageContext
434 (GetArguments().GetProgramBasename(),
435 "Object serialization demo program: Seq-entry translator");
436
437 // Describe the expected command-line arguments
438 arg_desc->AddDefaultKey
439 ("input", "InputFile",
440 "name of file to read from (standard input by default)",
441 CArgDescriptions::eInputFile, "-", CArgDescriptions::fPreOpen);
442
443 arg_desc->AddDefaultKey
444 ("output", "OutputFile",
445 "name of file to write to (standard output by default)",
446 CArgDescriptions::eOutputFile, "-");
447 arg_desc->AddDefaultKey("outfmt", "OutputFormat", "format of output file",
448 CArgDescriptions::eString, "asn");
449 arg_desc->SetConstraint("outfmt",
450 &(*new CArgAllow_Strings, "asn", "asnb", "xml"));
451
452 arg_desc->AddOptionalKey("mapfile", "MapFile",
453 "IdMapper config filename",
454 CArgDescriptions::eInputFile);
455 arg_desc->AddDefaultKey("genome", "Genome",
456 "UCSC build number",
457 CArgDescriptions::eString, "");
458
459 arg_desc->AddFlag("as-graph",
460 "Generate Seq-graph");
461 arg_desc->AddFlag("single-annot",
462 "Put all Seq-graphs in a single Seq-annot");
463 arg_desc->AddFlag("as-byte",
464 "Convert data in byte range");
465 arg_desc->AddFlag("omit-zeros",
466 "Omit zero values");
467 arg_desc->AddFlag("join-same",
468 "Join equal sequential values");
469 arg_desc->AddFlag("keep-integer",
470 "Keep integer as is if they fit in an output range");
471 arg_desc->AddDefaultKey("gap-value", "GapValue",
472 "Assumed value in gaps",
473 CArgDescriptions::eDouble, "0");
474
475 arg_desc->AddOptionalKey("name", "Name",
476 "Track name or graph title",
477 CArgDescriptions::eString);
478
479 // Setup arg.descriptions for this application
480 SetupArgDescriptions(arg_desc.release());
481 }
482
483
s_GetFormat(const string & name)484 static ESerialDataFormat s_GetFormat(const string& name)
485 {
486 if (name == "asn") {
487 return eSerial_AsnText;
488 } else if (name == "asnb") {
489 return eSerial_AsnBinary;
490 } else if (name == "xml") {
491 return eSerial_Xml;
492 } else {
493 // Should be caught by argument processing, but as an illustration...
494 THROW1_TRACE(runtime_error, "Bad serial format name " + name);
495 }
496 }
497
498
EstimateSize(size_t rows,bool fixed_span) const499 double CWig2tableApplication::EstimateSize(size_t rows, bool fixed_span) const
500 {
501 double ret = 0;
502 ret += rows*4;
503 if ( !fixed_span )
504 ret += rows*4;
505 if ( m_AsByte )
506 ret += rows;
507 else
508 ret += 8*rows;
509 return ret;
510 }
511
512
x_PreprocessValues(void)513 CWig2tableApplication::SStat CWig2tableApplication::x_PreprocessValues(void)
514 {
515 SStat stat;
516 bool sorted = true;
517 size_t size = m_Values.size();
518 if ( size ) {
519 stat.SetFirstSpan(m_Values[0].m_Span);
520 stat.SetFirstValue(m_Values[0].m_Value);
521
522 for ( size_t i = 1; i < size; ++i ) {
523 stat.AddSpan(m_Values[i].m_Span);
524 stat.AddValue(m_Values[i].m_Value);
525 if ( sorted ) {
526 if ( m_Values[i].m_Pos < m_Values[i-1].m_Pos ) {
527 sorted = false;
528 }
529 if ( m_Values[i].m_Pos != m_Values[i-1].GetEnd() ) {
530 stat.m_HaveGaps = true;
531 }
532 }
533 }
534 }
535 if ( !sorted ) {
536 sort(m_Values.begin(), m_Values.end());
537 stat.m_HaveGaps = false;
538 for ( size_t i = 1; i < size; ++i ) {
539 if ( m_Values[i].m_Pos != m_Values[i-1].GetEnd() ) {
540 stat.m_HaveGaps = true;
541 break;
542 }
543 }
544 }
545 if ( m_AsGraph && stat.m_HaveGaps ) {
546 stat.AddValue(m_GapValue);
547 }
548
549 const int range = 255;
550 if ( stat.m_Max > stat.m_Min &&
551 (!m_KeepInteger ||
552 !stat.m_IntValues ||
553 stat.m_Max-stat.m_Min > range) ) {
554 stat.m_Step = (stat.m_Max-stat.m_Min)/range;
555 stat.m_StepMul = 1/stat.m_Step;
556 }
557
558 if ( !m_AsGraph && m_JoinSame && size ) {
559 TValues nv;
560 nv.reserve(size);
561 nv.push_back(m_Values[0]);
562 for ( size_t i = 1; i < size; ++i ) {
563 if ( m_Values[i].m_Pos == nv.back().GetEnd() &&
564 m_Values[i].m_Value == nv.back().m_Value ) {
565 nv.back().m_Span += m_Values[i].m_Span;
566 }
567 else {
568 nv.push_back(m_Values[i]);
569 }
570 }
571 if ( nv.size() != size ) {
572 double s = EstimateSize(size, stat.m_FixedSpan);
573 double ns = EstimateSize(nv.size(), false);
574 if ( ns < s*.75 ) {
575 m_Values.swap(nv);
576 size = m_Values.size();
577 LOG_POST("Joined size: "<<size);
578 stat.m_FixedSpan = false;
579 }
580 }
581 }
582
583 if ( m_AsGraph && !stat.m_FixedSpan ) {
584 stat.m_Span = 1;
585 stat.m_FixedSpan = true;
586 }
587
588 return stat;
589 }
590
591
x_MakeChromId(void)592 CRef<CSeq_id> CWig2tableApplication::x_MakeChromId(void)
593 {
594 CRef<CSeq_id> chrom_id(new CSeq_id(CSeq_id::e_Local, m_ChromId));
595 if ( m_IdMapper ) {
596 m_IdMapper->MapObject(*chrom_id);
597 }
598 return chrom_id;
599 }
600
601
x_SetTotalLoc(CSeq_loc & loc,CSeq_id & chrom_id)602 void CWig2tableApplication::x_SetTotalLoc(CSeq_loc& loc, CSeq_id& chrom_id)
603 {
604 if ( m_Values.empty() ) {
605 loc.SetEmpty(chrom_id);
606 }
607 else {
608 CSeq_interval& interval = loc.SetInt();
609 interval.SetId(chrom_id);
610 interval.SetFrom(m_Values.front().m_Pos);
611 interval.SetTo(m_Values.back().GetEnd()-1);
612 }
613 }
614
615
MakeTable(void)616 CRef<CSeq_table> CWig2tableApplication::MakeTable(void)
617 {
618 CRef<CSeq_table> table(new CSeq_table);
619
620 table->SetFeat_type(0);
621
622 CRef<CSeq_id> chrom_id = x_MakeChromId();
623
624 CRef<CSeq_loc> table_loc(new CSeq_loc);
625 { // Seq-table location
626 CRef<CSeqTable_column> col_id(new CSeqTable_column);
627 table->SetColumns().push_back(col_id);
628 col_id->SetHeader().SetField_name("Seq-table location");
629 col_id->SetDefault().SetLoc(*table_loc);
630 }
631
632 { // Seq-id
633 CRef<CSeqTable_column> col_id(new CSeqTable_column);
634 table->SetColumns().push_back(col_id);
635 col_id->SetHeader().SetField_id(CSeqTable_column_info::eField_id_location_id);
636 col_id->SetDefault().SetId(*chrom_id);
637 }
638
639 // position
640 CRef<CSeqTable_column> col_pos(new CSeqTable_column);
641 table->SetColumns().push_back(col_pos);
642 col_pos->SetHeader().SetField_id(CSeqTable_column_info::eField_id_location_from);
643 CSeqTable_multi_data::TInt& pos = col_pos->SetData().SetInt();
644
645 SStat stat = x_PreprocessValues();
646
647 x_SetTotalLoc(*table_loc, *chrom_id);
648
649 size_t size = m_Values.size();
650 table->SetNum_rows(size);
651 pos.reserve(size);
652
653 CSeqTable_multi_data::TInt* span_ptr = 0;
654 { // span
655 CRef<CSeqTable_column> col_span(new CSeqTable_column);
656 table->SetColumns().push_back(col_span);
657 col_span->SetHeader().SetField_name("span");
658 if ( stat.m_FixedSpan ) {
659 col_span->SetDefault().SetInt(stat.m_Span);
660 }
661 else {
662 span_ptr = &col_span->SetData().SetInt();
663 span_ptr->reserve(size);
664 }
665 }
666
667 if ( stat.m_HaveGaps ) {
668 CRef<CSeqTable_column> col_step(new CSeqTable_column);
669 table->SetColumns().push_back(col_step);
670 col_step->SetHeader().SetField_name("value_gap");
671 col_step->SetDefault().SetReal(m_GapValue);
672 }
673
674 if ( m_AsByte ) { // values
675 CRef<CSeqTable_column> col_min(new CSeqTable_column);
676 table->SetColumns().push_back(col_min);
677 col_min->SetHeader().SetField_name("value_min");
678 col_min->SetDefault().SetReal(stat.m_Min);
679
680 CRef<CSeqTable_column> col_step(new CSeqTable_column);
681 table->SetColumns().push_back(col_step);
682 col_step->SetHeader().SetField_name("value_step");
683 col_step->SetDefault().SetReal(stat.m_Step);
684
685 CRef<CSeqTable_column> col_val(new CSeqTable_column);
686 table->SetColumns().push_back(col_val);
687 col_val->SetHeader().SetField_name("values");
688
689 if ( 1 ) {
690 AutoPtr< vector<char> > values(new vector<char>());
691 values->reserve(size);
692 ITERATE ( TValues, it, m_Values ) {
693 pos.push_back(it->m_Pos);
694 if ( span_ptr ) {
695 span_ptr->push_back(it->m_Span);
696 }
697 values->push_back(stat.AsByte(it->m_Value));
698 }
699 col_val->SetData().SetBytes().push_back(values.release());
700 }
701 else {
702 CSeqTable_multi_data::TInt& values = col_val->SetData().SetInt();
703 values.reserve(size);
704
705 ITERATE ( TValues, it, m_Values ) {
706 pos.push_back(it->m_Pos);
707 if ( span_ptr ) {
708 span_ptr->push_back(it->m_Span);
709 }
710 values.push_back(stat.AsByte(it->m_Value));
711 }
712 }
713 }
714 else {
715 CRef<CSeqTable_column> col_val(new CSeqTable_column);
716 table->SetColumns().push_back(col_val);
717 col_val->SetHeader().SetField_name("values");
718 CSeqTable_multi_data::TReal& values = col_val->SetData().SetReal();
719 values.reserve(size);
720
721 ITERATE ( TValues, it, m_Values ) {
722 pos.push_back(it->m_Pos);
723 if ( span_ptr ) {
724 span_ptr->push_back(it->m_Span);
725 }
726 values.push_back(it->m_Value);
727 }
728 }
729 return table;
730 }
731
732
MakeGraph(void)733 CRef<CSeq_graph> CWig2tableApplication::MakeGraph(void)
734 {
735 CRef<CSeq_graph> graph(new CSeq_graph);
736
737 CRef<CSeq_id> chrom_id = x_MakeChromId();
738
739 CRef<CSeq_loc> graph_loc(new CSeq_loc);
740 graph->SetLoc(*graph_loc);
741
742 SStat stat = x_PreprocessValues();
743
744 x_SetTotalLoc(*graph_loc, *chrom_id);
745
746 if ( !m_TrackName.empty() ) {
747 graph->SetTitle(m_TrackName);
748 }
749 graph->SetComp(stat.m_Span);
750 graph->SetA(stat.m_Step);
751 graph->SetB(stat.m_Min);
752
753 CByte_graph& b_graph = graph->SetGraph().SetByte();
754 b_graph.SetMin(stat.AsByte(stat.m_Min));
755 b_graph.SetMax(stat.AsByte(stat.m_Max));
756 b_graph.SetAxis(0);
757 vector<char>& bytes = b_graph.SetValues();
758
759 if ( m_Values.empty() ) {
760 graph->SetNumval(0);
761 }
762 else {
763 _ASSERT(stat.m_FixedSpan);
764 TSeqPos start = m_Values[0].m_Pos;
765 TSeqPos end = m_Values.back().GetEnd();
766 size_t size = (end-start)/stat.m_Span;
767 graph->SetNumval(size);
768 bytes.resize(size, stat.AsByte(m_GapValue));
769 ITERATE ( TValues, it, m_Values ) {
770 TSeqPos pos = it->m_Pos - start;
771 TSeqPos span = it->m_Span;
772 _ASSERT(pos % stat.m_Span == 0);
773 _ASSERT(span % stat.m_Span == 0);
774 size_t i = pos / stat.m_Span;
775 int v = stat.AsByte(it->m_Value);
776 for ( ; span > 0; span -= stat.m_Span, ++i ) {
777 bytes[i] = v;
778 }
779 }
780 }
781 return graph;
782 }
783
784
MakeAnnot(void)785 CRef<CSeq_annot> CWig2tableApplication::MakeAnnot(void)
786 {
787 CRef<CSeq_annot> annot(new CSeq_annot);
788 if ( !m_TrackDescription.empty() ) {
789 CRef<CAnnotdesc> desc(new CAnnotdesc);
790 desc->SetTitle(m_TrackDescription);
791 annot->SetDesc().Set().push_back(desc);
792 }
793 if ( !m_TrackName.empty() ) {
794 CRef<CAnnotdesc> desc(new CAnnotdesc);
795 desc->SetName(m_TrackName);
796 annot->SetDesc().Set().push_back(desc);
797 }
798 if ( true ) {
799 m_TrackParams["track type"] = "graph";
800 }
801 if ( !m_TrackParams.empty() ) {
802 CRef<CAnnotdesc> desc(new CAnnotdesc);
803 annot->SetDesc().Set().push_back(desc);
804 CUser_object& user = desc->SetUser();
805 user.SetType().SetStr("Track Data");
806 ITERATE ( TTrackParams, it, m_TrackParams ) {
807 CRef<CUser_field> field(new CUser_field);
808 field->SetLabel().SetStr(it->first);
809 field->SetData().SetStr(it->second);
810 user.SetData().push_back(field);
811 }
812 }
813 return annot;
814 }
815
816
MakeTableAnnot(void)817 CRef<CSeq_annot> CWig2tableApplication::MakeTableAnnot(void)
818 {
819 CRef<CSeq_annot> annot = MakeAnnot();
820 annot->SetData().SetSeq_table(*MakeTable());
821 return annot;
822 }
823
824
MakeGraphAnnot(void)825 CRef<CSeq_annot> CWig2tableApplication::MakeGraphAnnot(void)
826 {
827 CRef<CSeq_annot> annot = MakeAnnot();
828 annot->SetData().SetGraph().push_back(MakeGraph());
829 return annot;
830 }
831
832
ResetChromValues(void)833 void CWig2tableApplication::ResetChromValues(void)
834 {
835 m_ChromId.clear();
836 m_Values.clear();
837 }
838
839
x_Error(const char * msg)840 void CWig2tableApplication::x_Error(const char* msg)
841 {
842 ERR_FATAL(GetArgs()["input"].AsString()<<":"<<m_Input->GetLineNumber()<<
843 ": "<<msg<<": \""<<m_CurLine<<"\"");
844 }
845
846
x_SkipWS(void)847 bool CWig2tableApplication::x_SkipWS(void)
848 {
849 const char* ptr = m_CurLine.data();
850 size_t skip = 0;
851 for ( size_t len = m_CurLine.size(); skip < len; ++skip ) {
852 char c = ptr[skip];
853 if ( c != ' ' && c != '\t' ) {
854 break;
855 }
856 }
857 m_CurLine = m_CurLine.substr(skip);
858 _ASSERT(m_CurLine.HasZeroAtEnd());
859 return !m_CurLine.empty();
860 }
861
862
x_CommentLine(void) const863 inline bool CWig2tableApplication::x_CommentLine(void) const
864 {
865 char c = m_CurLine.data()[0];
866 return c == '#';// || c == '\0';
867 }
868
869
x_GetWord(void)870 CTempString CWig2tableApplication::x_GetWord(void)
871 {
872 const char* ptr = m_CurLine.data();
873 size_t skip = 0;
874 for ( size_t len = m_CurLine.size(); skip < len; ++skip ) {
875 char c = ptr[skip];
876 if ( c == ' ' || c == '\t' ) {
877 break;
878 }
879 }
880 if ( skip == 0 ) {
881 x_Error("Identifier expected");
882 }
883 m_CurLine = m_CurLine.substr(skip);
884 _ASSERT(m_CurLine.HasZeroAtEnd());
885 return CTempString(ptr, skip);
886 }
887
888
x_GetParamName(void)889 CTempString CWig2tableApplication::x_GetParamName(void)
890 {
891 const char* ptr = m_CurLine.data();
892 size_t skip = 0;
893 for ( size_t len = m_CurLine.size(); skip < len; ++skip ) {
894 char c = ptr[skip];
895 if ( c == '=' ) {
896 m_CurLine = m_CurLine.substr(skip+1);
897 _ASSERT(m_CurLine.HasZeroAtEnd());
898 return CTempString(ptr, skip);
899 }
900 if ( c == ' ' || c == '\t' ) {
901 break;
902 }
903 }
904 x_Error("'=' expected");
905 return CTempString();
906 }
907
908
x_GetParamValue(void)909 CTempString CWig2tableApplication::x_GetParamValue(void)
910 {
911 const char* ptr = m_CurLine.data();
912 size_t len = m_CurLine.size();
913 if ( len && *ptr == '"' ) {
914 size_t pos = 1;
915 for ( ; pos < len; ++pos ) {
916 char c = ptr[pos];
917 if ( c == '"' ) {
918 m_CurLine = m_CurLine.substr(pos+1);
919 _ASSERT(m_CurLine.HasZeroAtEnd());
920 return CTempString(ptr+1, pos-1);
921 }
922 }
923 x_Error("Open quotes");
924 }
925 return x_GetWord();
926 }
927
928
x_GetPos(TSeqPos & v)929 void CWig2tableApplication::x_GetPos(TSeqPos& v)
930 {
931 TSeqPos ret = 0;
932 const char* ptr = m_CurLine.data();
933 for ( size_t skip = 0; ; ++skip ) {
934 char c = ptr[skip];
935 if ( c >= '0' && c <= '9' ) {
936 ret = ret*10 + (c-'0');
937 }
938 else if ( (c == ' ' || c == '\t' || c == '\0') && skip ) {
939 m_CurLine = m_CurLine.substr(skip);
940 _ASSERT(m_CurLine.HasZeroAtEnd());
941 v = ret;
942 return;
943 }
944 else {
945 x_Error("Integer value expected");
946 }
947 }
948 }
949
950
x_TryGetDoubleSimple(double & v)951 bool CWig2tableApplication::x_TryGetDoubleSimple(double& v)
952 {
953 double ret = 0;
954 const char* ptr = m_CurLine.data();
955 size_t skip = 0;
956 bool negate = false, digits = false;
957 for ( ; ; ++skip ) {
958 char c = ptr[skip];
959 if ( !skip ) {
960 if ( c == '-' ) {
961 negate = true;
962 continue;
963 }
964 if ( c == '+' ) {
965 continue;
966 }
967 }
968 if ( c >= '0' && c <= '9' ) {
969 digits = true;
970 ret = ret*10 + (c-'0');
971 }
972 else if ( c == '.' ) {
973 ++skip;
974 break;
975 }
976 else if ( c == '\0' ) {
977 if ( !digits ) {
978 return false;
979 }
980 m_CurLine.clear();
981 _ASSERT(m_CurLine.HasZeroAtEnd());
982 if ( negate ) {
983 ret = -ret;
984 }
985 v = ret;
986 return true;
987 }
988 else {
989 return false;
990 }
991 }
992 double digit_mul = 1;
993 for ( ; ; ++skip ) {
994 char c = ptr[skip];
995 if ( c >= '0' && c <= '9' ) {
996 digits = true;
997 digit_mul *= .1;
998 ret += (c-'0')*digit_mul;
999 }
1000 else if ( (c == ' ' || c == '\t' || c == '\0') && digits ) {
1001 m_CurLine.clear();
1002 _ASSERT(m_CurLine.HasZeroAtEnd());
1003 v = ret;
1004 if ( negate ) {
1005 ret = -ret;
1006 }
1007 return true;
1008 }
1009 else {
1010 return false;
1011 }
1012 }
1013 }
1014
1015
x_TryGetDouble(double & v)1016 bool CWig2tableApplication::x_TryGetDouble(double& v)
1017 {
1018 if ( x_TryGetDoubleSimple(v) ) {
1019 return true;
1020 }
1021 const char* ptr = m_CurLine.data();
1022 char* endptr = 0;
1023 v = strtod(ptr, &endptr);
1024 if ( endptr == ptr ) {
1025 return false;
1026 }
1027 if ( *endptr ) {
1028 x_Error("extra text in line");
1029 }
1030 m_CurLine.clear();
1031 _ASSERT(m_CurLine.HasZeroAtEnd());
1032 return true;
1033 }
1034
1035
x_TryGetPos(TSeqPos & v)1036 inline bool CWig2tableApplication::x_TryGetPos(TSeqPos& v)
1037 {
1038 char c = m_CurLine.data()[0];
1039 if ( c < '0' || c > '9' ) {
1040 return false;
1041 }
1042 x_GetPos(v);
1043 return true;
1044 }
1045
1046
x_GetDouble(double & v)1047 inline void CWig2tableApplication::x_GetDouble(double& v)
1048 {
1049 if ( !x_TryGetDouble(v) ) {
1050 x_Error("Floating point value expected");
1051 }
1052 }
1053
1054
x_GetLine()1055 bool CWig2tableApplication::x_GetLine()
1056 {
1057 if ( m_Input->AtEOF() ) {
1058 return false;
1059 }
1060 m_CurLine = *++*m_Input;
1061 _ASSERT(m_CurLine.HasZeroAtEnd());
1062 return true;
1063 }
1064
1065
x_UngetLine(void)1066 inline void CWig2tableApplication::x_UngetLine(void)
1067 {
1068 m_Input->UngetLine();
1069 }
1070
1071
DumpAnnot(void)1072 void CWig2tableApplication::DumpAnnot(void)
1073 {
1074 if ( !m_Annot ) {
1075 return;
1076 }
1077 if ( m_AsGraph ) {
1078 LOG_POST("DumpAnnot");
1079 }
1080 *m_Output << *m_Annot;
1081 m_Annot = null;
1082 }
1083
1084
DumpChromValues(void)1085 void CWig2tableApplication::DumpChromValues(void)
1086 {
1087 if ( m_ChromId.empty() ) {
1088 return;
1089 }
1090 LOG_POST("Chrom: "<<m_ChromId<<" "<<m_Values.size());
1091 if ( !m_Annot ) {
1092 m_Annot = MakeAnnot();
1093 }
1094 if ( m_AsGraph ) {
1095 m_Annot->SetData().SetGraph().push_back(MakeGraph());
1096 }
1097 else {
1098 m_Annot->SetData().SetSeq_table(*MakeTable());
1099 }
1100 if ( !m_SingleAnnot ) {
1101 DumpAnnot();
1102 }
1103 ResetChromValues();
1104 }
1105
1106
SetChrom(CTempString chrom)1107 void CWig2tableApplication::SetChrom(CTempString chrom)
1108 {
1109 if ( chrom != m_ChromId ) {
1110 DumpChromValues();
1111 m_ChromId = chrom;
1112 }
1113 }
1114
1115
ReadBrowser(void)1116 void CWig2tableApplication::ReadBrowser(void)
1117 {
1118 }
1119
1120
ReadTrack(void)1121 void CWig2tableApplication::ReadTrack(void)
1122 {
1123 DumpAnnot();
1124
1125 m_TrackDescription.clear();
1126 m_TrackTypeValue.clear();
1127 m_TrackType = eTrackType_invalid;
1128 m_TrackParams.clear();
1129 while ( x_SkipWS() ) {
1130 CTempString name = x_GetParamName();
1131 CTempString value = x_GetParamValue();
1132 if ( name == "type" ) {
1133 m_TrackTypeValue = value;
1134 if ( value == "wiggle_0" ) {
1135 m_TrackType = eTrackType_wiggle_0;
1136 }
1137 else if ( value == "bedGraph" ) {
1138 m_TrackType = eTrackType_bedGraph;
1139 }
1140 else {
1141 x_Error("Invalid track type");
1142 }
1143 }
1144 else if ( name == "name" ) {
1145 m_TrackName = value;
1146 }
1147 else if ( name == "description" ) {
1148 m_TrackDescription = value;
1149 }
1150 else {
1151 m_TrackParams[name] = value;
1152 }
1153 }
1154 if ( m_TrackType == eTrackType_invalid ) {
1155 x_Error("Unknown track type");
1156 }
1157 }
1158
1159
ReadFixedStep(void)1160 void CWig2tableApplication::ReadFixedStep(void)
1161 {
1162 if ( m_TrackType != eTrackType_wiggle_0 &&
1163 m_TrackType != eTrackType_invalid ) {
1164 x_Error("Track type=wiggle_0 is required");
1165 }
1166
1167 size_t start = 0;
1168 size_t step = 0;
1169 size_t span = 1;
1170 while ( x_SkipWS() ) {
1171 CTempString name = x_GetParamName();
1172 CTempString value = x_GetParamValue();
1173 if ( name == "chrom" ) {
1174 SetChrom(value);
1175 }
1176 else if ( name == "start" ) {
1177 start = NStr::StringToUInt(value);
1178 }
1179 else if ( name == "step" ) {
1180 step = NStr::StringToUInt(value);
1181 }
1182 else if ( name == "span" ) {
1183 span = NStr::StringToUInt(value);
1184 }
1185 else {
1186 x_Error("Bad param name");
1187 }
1188 }
1189 if ( m_ChromId.empty() ) {
1190 x_Error("No chrom");
1191 }
1192 if ( start == 0 ) {
1193 x_Error("No start");
1194 }
1195 if ( step == 0 ) {
1196 x_Error("No step");
1197 }
1198
1199 SValueInfo value;
1200 value.m_Pos = start-1;
1201 value.m_Span = span;
1202 while ( x_GetLine() ) {
1203 if ( !x_TryGetDouble(value.m_Value) ) {
1204 x_UngetLine();
1205 break;
1206 }
1207 AddValue(value);
1208 value.m_Pos += step;
1209 }
1210 }
1211
1212
ReadVariableStep(void)1213 void CWig2tableApplication::ReadVariableStep(void)
1214 {
1215 if ( m_TrackType != eTrackType_wiggle_0 &&
1216 m_TrackType != eTrackType_invalid ) {
1217 x_Error("Track type=wiggle_0 is required");
1218 }
1219
1220 size_t span = 1;
1221 while ( x_SkipWS() ) {
1222 CTempString name = x_GetParamName();
1223 CTempString value = x_GetParamValue();
1224 if ( name == "chrom" ) {
1225 SetChrom(value);
1226 }
1227 else if ( name == "span" ) {
1228 span = NStr::StringToUInt(value);
1229 }
1230 else {
1231 x_Error("Bad param name");
1232 }
1233 }
1234 if ( m_ChromId.empty() ) {
1235 x_Error("No chrom");
1236 }
1237 SValueInfo value;
1238 value.m_Span = span;
1239 while ( x_GetLine() ) {
1240 if ( !x_TryGetPos(value.m_Pos) ) {
1241 x_UngetLine();
1242 break;
1243 }
1244 x_SkipWS();
1245 x_GetDouble(value.m_Value);
1246 value.m_Pos -= 1;
1247 AddValue(value);
1248 }
1249 }
1250
1251
ReadBedLine(CTempString chrom)1252 void CWig2tableApplication::ReadBedLine(CTempString chrom)
1253 {
1254 if ( m_TrackType != eTrackType_bedGraph &&
1255 m_TrackType != eTrackType_invalid ) {
1256 x_Error("Track type=bedGraph is required");
1257 }
1258 SetChrom(chrom);
1259 SValueInfo value;
1260 x_SkipWS();
1261 x_GetPos(value.m_Pos);
1262 x_SkipWS();
1263 x_GetPos(value.m_Span);
1264 x_SkipWS();
1265 x_GetDouble(value.m_Value);
1266 value.m_Span -= value.m_Pos;
1267 AddValue(value);
1268 }
1269
1270
Run(void)1271 int CWig2tableApplication::Run(void)
1272 {
1273 SetDiagPostLevel(eDiag_Warning);
1274 // Get arguments
1275 const CArgs& args = GetArgs();
1276
1277 if ( args["mapfile"] ) {
1278 m_IdMapper.reset(new CIdMapperConfig(args["mapfile"].AsInputFile(),
1279 args["genome"].AsString(),
1280 false));
1281 }
1282 else if ( !args["genome"].AsString().empty() ) {
1283 LOG_POST("Genome: "<<args["genome"].AsString());
1284 m_IdMapper.reset(new CIdMapperBuiltin(args["genome"].AsString(),
1285 false));
1286 }
1287
1288 m_AsGraph = args["as-graph"];
1289 m_SingleAnnot = args["single-annot"];
1290 m_OmitZeros = args["omit-zeros"];
1291 m_JoinSame = args["join-same"];
1292 m_AsByte = m_AsGraph || args["as-byte"];
1293 m_KeepInteger = args["keep-integer"];
1294 m_GapValue = args["gap-value"].AsDouble();
1295 if ( m_AsGraph ) {
1296 if ( m_OmitZeros ) {
1297 ERR_POST(Warning<<
1298 "The option -omit_zeros is ignored for Seq-graph");
1299 m_OmitZeros = false;
1300 }
1301 if ( m_JoinSame ) {
1302 ERR_POST(Warning<<
1303 "The option -join_same is ignored for Seq-graph");
1304 m_JoinSame = false;
1305 }
1306 }
1307 else {
1308 if ( m_SingleAnnot ) {
1309 ERR_POST(Warning<<
1310 "The option -single-annot is ignored for Seq-table");
1311 m_SingleAnnot = false;
1312 }
1313 }
1314
1315 // Read the entry
1316 m_Input.reset(new CWigBufferedLineReader(args["input"].AsString()));
1317 m_Output.reset
1318 (CObjectOStream::Open(s_GetFormat(args["outfmt"].AsString()),
1319 args["output"].AsOutputFile()));
1320
1321 if ( args["name"] ) {
1322 m_TrackName = args["name"].AsString();
1323 }
1324 else if ( m_TrackName.empty() ) {
1325 m_TrackName = "User Track";
1326 }
1327
1328 while ( x_GetLine() ) {
1329 if ( x_CommentLine() ) {
1330 continue;
1331 }
1332 CTempString s = x_GetWord();
1333 if ( s == "browser" ) {
1334 ReadBrowser();
1335 }
1336 else if ( s == "track" ) {
1337 ReadTrack();
1338 }
1339 else if ( s == "fixedStep" ) {
1340 ReadFixedStep();
1341 }
1342 else if ( s == "variableStep" ) {
1343 ReadVariableStep();
1344 }
1345 else {
1346 ReadBedLine(s);
1347 }
1348 }
1349 m_Input.reset();
1350
1351 DumpChromValues();
1352 DumpAnnot();
1353 m_Output.reset();
1354
1355 // Exit successfully
1356 return 0;
1357 }
1358
1359
1360 /////////////////////////////////////////////////////////////////////////////
1361 // MAIN
1362
1363
main(int argc,const char * argv[])1364 int main(int argc, const char* argv[])
1365 {
1366 // Execute main application function
1367 return CWig2tableApplication().AppMain(argc, argv);
1368 }
1369