1 /*  $Id: wig2table.cpp 637992 2021-09-21 18:48:18Z ivanov $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:  Aaron Ucko
27 *
28 * File Description:
29 *   Simple program demonstrating the use of serializable objects (in this
30 *   case, biological sequences).  Does NOT use the object manager.
31 *
32 * ===========================================================================
33 */
34 
35 #include <ncbi_pch.hpp>
36 #include <corelib/ncbiapp.hpp>
37 #include <corelib/ncbiargs.hpp>
38 #include <corelib/ncbienv.hpp>
39 
40 #include <serial/iterator.hpp>
41 #include <serial/objistr.hpp>
42 #include <serial/objostr.hpp>
43 #include <serial/serial.hpp>
44 
45 #include <objects/general/Object_id.hpp>
46 #include <objects/general/User_object.hpp>
47 #include <objects/general/User_field.hpp>
48 #include <objects/seqloc/seqloc__.hpp>
49 #include <objects/seqtable/seqtable__.hpp>
50 #include <objects/seqres/seqres__.hpp>
51 #include <objects/seq/Seq_annot.hpp>
52 #include <objects/seq/Annot_descr.hpp>
53 #include <objects/seq/Annotdesc.hpp>
54 #include <objtools/readers/idmapper.hpp>
55 #include <corelib/ncbifile.hpp>
56 
57 USING_SCOPE(ncbi);
58 USING_SCOPE(objects);
59 
60 class CWigBufferedLineReader
61 {
62 public:
63     /// read from the file, "-" (but not "./-") means standard input
64     CWigBufferedLineReader(const string& filename);
65     virtual ~CWigBufferedLineReader();
66 
67     bool                AtEOF(void) const;
68     CWigBufferedLineReader& operator++(void);
69     void                UngetLine(void);
70     CTempStringEx       operator*(void) const;
71     CT_POS_TYPE         GetPosition(void) const;
72     Uint8               GetLineNumber(void) const;
73 
74 protected:
75 
76     void x_LoadLong(void);
77     bool x_ReadBuffer(void);
78 
79 private:
80     AutoPtr<IReader> m_Reader;
81     bool          m_Eof;
82     bool          m_UngetLine;
83     size_t        m_BufferSize;
84     AutoArray<char> m_Buffer;
85     char*         m_Pos;
86     char*         m_End;
87     CTempStringEx m_Line;
88     string        m_String;
89     CT_POS_TYPE   m_InputPos;
90     Uint8         m_LineNumber;
91 };
92 
93 
CWigBufferedLineReader(const string & filename)94 CWigBufferedLineReader::CWigBufferedLineReader(const string& filename)
95     : m_Reader(CFileReader::New(filename)),
96       m_Eof(false),
97       m_UngetLine(false),
98       m_BufferSize(32*1024),
99       m_Buffer(new char[m_BufferSize]),
100       m_Pos(m_Buffer.get()),
101       m_End(m_Pos),
102       m_InputPos(0),
103       m_LineNumber(0)
104 {
105     x_ReadBuffer();
106 }
107 
108 
~CWigBufferedLineReader()109 CWigBufferedLineReader::~CWigBufferedLineReader()
110 {
111 }
112 
113 
AtEOF(void) const114 bool CWigBufferedLineReader::AtEOF(void) const
115 {
116     return m_Eof && !m_UngetLine;
117 }
118 
119 
UngetLine(void)120 void CWigBufferedLineReader::UngetLine(void)
121 {
122     _ASSERT(!m_UngetLine);
123     _ASSERT(m_Line.begin());
124     --m_LineNumber;
125     m_UngetLine = true;
126 }
127 
128 
operator ++(void)129 CWigBufferedLineReader& CWigBufferedLineReader::operator++(void)
130 {
131     ++m_LineNumber;
132     if ( m_UngetLine ) {
133         _ASSERT(m_Line.begin());
134         m_UngetLine = false;
135         return *this;
136     }
137     // check if we are at the buffer end
138     char* start = m_Pos;
139     char* end = m_End;
140     for ( char* p = start; p < end; ++p ) {
141         if ( *p == '\n' ) {
142             *p = '\0';
143             m_Line.assign(start, p - start, CTempStringEx::eHasZeroAtEnd);
144             m_Pos = ++p;
145             if ( p == end ) {
146                 m_String = m_Line;
147                 m_Line = m_String;
148                 x_ReadBuffer();
149             }
150             return *this;
151         }
152         else if ( *p == '\r' ) {
153             *p = '\0';
154             m_Line.assign(start, p - start, CTempStringEx::eHasZeroAtEnd);
155             if ( ++p == end ) {
156                 m_String = m_Line;
157                 m_Line = m_String;
158                 if ( x_ReadBuffer() ) {
159                     p = m_Pos;
160                     if ( *p == '\n' ) {
161                         m_Pos = p+1;
162                     }
163                 }
164                 return *this;
165             }
166             if ( *p != '\n' ) {
167                 return *this;
168             }
169             m_Pos = ++p;
170             if ( p == end ) {
171                 m_String = m_Line;
172                 m_Line = m_String;
173                 x_ReadBuffer();
174             }
175             return *this;
176         }
177     }
178     x_LoadLong();
179     return *this;
180 }
181 
182 
x_LoadLong(void)183 void CWigBufferedLineReader::x_LoadLong(void)
184 {
185     char* start = m_Pos;
186     char* end = m_End;
187     m_String.assign(start, end);
188     while ( x_ReadBuffer() ) {
189         start = m_Pos;
190         end = m_End;
191         for ( char* p = start; p < end; ++p ) {
192             char c = *p;
193             if ( c == '\r' || c == '\n' ) {
194                 m_String.append(start, p - start);
195                 m_Line = m_String;
196                 if ( ++p == end ) {
197                     m_String = m_Line;
198                     m_Line = m_String;
199                     if ( x_ReadBuffer() ) {
200                         p = m_Pos;
201                         end = m_End;
202                         if ( p < end && c == '\r' && *p == '\n' ) {
203                             ++p;
204                             m_Pos = p;
205                         }
206                     }
207                 }
208                 else {
209                     if ( c == '\r' && *p == '\n' ) {
210                         if ( ++p == end ) {
211                             x_ReadBuffer();
212                             p = m_Pos;
213                         }
214                     }
215                     m_Pos = p;
216                 }
217                 return;
218             }
219         }
220         m_String.append(start, end - start);
221     }
222     m_Line = m_String;
223     return;
224 }
225 
226 
x_ReadBuffer()227 bool CWigBufferedLineReader::x_ReadBuffer()
228 {
229     _ASSERT(m_Reader);
230 
231     if ( m_Eof ) {
232         return false;
233     }
234 
235     m_InputPos += CT_OFF_TYPE(m_End - m_Buffer.get());
236     m_Pos = m_End = m_Buffer.get();
237     for (bool flag = true; flag; ) {
238         size_t size;
239         ERW_Result result =
240             m_Reader->Read(m_Buffer.get(), m_BufferSize, &size);
241         switch (result) {
242         case eRW_NotImplemented:
243         case eRW_Error:
244             NCBI_THROW(CIOException, eRead, "Read error");
245             /*NOTREACHED*/
246             break;
247         case eRW_Timeout:
248             // keep spinning around
249             break;
250         case eRW_Eof:
251             m_Eof = true;
252             // fall through
253         case eRW_Success:
254             m_End = m_Pos + size;
255             return (result == eRW_Success  ||  size > 0);
256         default:
257             _ASSERT(0);
258         }
259     } // for
260     return false;
261 }
262 
263 
operator *(void) const264 CTempStringEx CWigBufferedLineReader::operator*(void) const
265 {
266     return m_Line;
267 }
268 
269 
GetPosition(void) const270 CT_POS_TYPE CWigBufferedLineReader::GetPosition(void) const
271 {
272     return m_InputPos + CT_OFF_TYPE(m_Pos - m_Buffer.get());
273 }
274 
275 
GetLineNumber(void) const276 Uint8 CWigBufferedLineReader::GetLineNumber(void) const
277 {
278     return m_LineNumber;
279 }
280 
281 
282 class CWig2tableApplication : public CNcbiApplication
283 {
284     virtual void Init(void);
285     virtual int  Run(void);
286 
287     CRef<CSeq_table> MakeTable(void);
288     CRef<CSeq_graph> MakeGraph(void);
289     CRef<CSeq_annot> MakeAnnot(void);
290     CRef<CSeq_annot> MakeTableAnnot(void);
291     CRef<CSeq_annot> MakeGraphAnnot(void);
292     void DumpAnnot(void);
293     void DumpChromValues(void);
294     void ResetChromValues(void);
295     void SetChrom(CTempString chrom);
296 
297     double EstimateSize(size_t rows, bool fixed_span) const;
298 
299     void ReadBrowser(void);
300     void ReadTrack(void);
301     void ReadFixedStep(void);
302     void ReadVariableStep(void);
303     void ReadBedLine(CTempString chrom);
304 
305     AutoPtr<CWigBufferedLineReader> m_Input;
306     AutoPtr<CObjectOStream> m_Output;
307 
308     CTempStringEx m_CurLine;
309 
310     bool x_GetLine(void);
311     void x_UngetLine(void);
312     bool x_SkipWS(void);
313     bool x_CommentLine(void) const;
314     CTempString x_GetWord(void);
315     CTempString x_GetParamName(void);
316     CTempString x_GetParamValue(void);
317     bool x_TryGetPos(TSeqPos& v);
318     bool x_TryGetDoubleSimple(double& v);
319     bool x_TryGetDouble(double& v);
320     void x_GetPos(TSeqPos& v);
321     void x_GetDouble(double& v);
322 
323     void x_Error(const char* msg);
324 
325     string m_ChromId;
326     struct SValueInfo {
327         TSeqPos m_Pos;
328         TSeqPos m_Span;
329         double m_Value;
330 
GetEndCWig2tableApplication::SValueInfo331         TSeqPos GetEnd(void) const {
332             return m_Pos + m_Span;
333         }
operator <CWig2tableApplication::SValueInfo334         bool operator<(const SValueInfo& v) const {
335             return m_Pos < v.m_Pos;
336         }
337     };
338 
339     struct SStat {
340         bool m_FixedSpan;
341         bool m_HaveGaps;
342         bool m_IntValues;
343         TSeqPos m_Span;
344         double m_Min, m_Max, m_Step, m_StepMul;
345 
SStatCWig2tableApplication::SStat346         SStat(void)
347             : m_FixedSpan(true),
348               m_HaveGaps(false),
349               m_IntValues(true),
350               m_Span(1),
351               m_Min(0),
352               m_Max(0),
353               m_Step(1),
354               m_StepMul(1)
355             {
356             }
SetFirstSpanCWig2tableApplication::SStat357         void SetFirstSpan(TSeqPos span)
358             {
359                 m_FixedSpan = true;
360                 m_Span = span;
361             }
AddSpanCWig2tableApplication::SStat362         void AddSpan(TSeqPos span)
363             {
364                 if ( span != m_Span ) {
365                     m_FixedSpan = false;
366                 }
367             }
SetFirstValueCWig2tableApplication::SStat368         void SetFirstValue(double v)
369             {
370                 m_Min = m_Max = v;
371                 m_IntValues = v == int(v);
372             }
AddValueCWig2tableApplication::SStat373         void AddValue(double v)
374             {
375                 if ( v < m_Min ) {
376                     m_Min = v;
377                 }
378                 if ( v > m_Max ) {
379                     m_Max = v;
380                 }
381                 if ( m_IntValues && v != int(v) ) {
382                     m_IntValues = false;
383                 }
384             }
AsByteCWig2tableApplication::SStat385         int AsByte(double v) const
386             {
387                 return int((v-m_Min)*m_StepMul+.5);
388             }
389     };
390     // sort values and return min and max values
391     SStat x_PreprocessValues(void);
392     CRef<CSeq_id> x_MakeChromId(void);
393     void x_SetTotalLoc(CSeq_loc& loc, CSeq_id& chrom_id);
394 
AddValue(const SValueInfo & value)395     void AddValue(const SValueInfo& value) {
396         if ( !m_OmitZeros || value.m_Value != 0 ) {
397             m_Values.push_back(value);
398         }
399     }
400 
401     string m_TrackName;
402     string m_TrackDescription;
403     string m_TrackTypeValue;
404     enum ETrackType {
405         eTrackType_invalid,
406         eTrackType_wiggle_0,
407         eTrackType_bedGraph
408     };
409     ETrackType m_TrackType;
410     typedef map<string, string> TTrackParams;
411     TTrackParams m_TrackParams;
412     typedef vector<SValueInfo> TValues;
413     TValues m_Values;
414     CRef<CSeq_annot> m_Annot;
415 
416     bool m_AsGraph;
417     bool m_SingleAnnot;
418     bool m_OmitZeros;
419     bool m_JoinSame;
420     bool m_AsByte;
421     bool m_KeepInteger;
422     double m_GapValue;
423     AutoPtr<IIdMapper> m_IdMapper;
424 };
425 
426 
Init(void)427 void CWig2tableApplication::Init(void)
428 {
429     // Create command-line argument descriptions class
430     auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
431 
432     // Specify USAGE context
433     arg_desc->SetUsageContext
434         (GetArguments().GetProgramBasename(),
435          "Object serialization demo program: Seq-entry translator");
436 
437     // Describe the expected command-line arguments
438     arg_desc->AddDefaultKey
439         ("input", "InputFile",
440          "name of file to read from (standard input by default)",
441          CArgDescriptions::eInputFile, "-", CArgDescriptions::fPreOpen);
442 
443     arg_desc->AddDefaultKey
444         ("output", "OutputFile",
445          "name of file to write to (standard output by default)",
446          CArgDescriptions::eOutputFile, "-");
447     arg_desc->AddDefaultKey("outfmt", "OutputFormat", "format of output file",
448                             CArgDescriptions::eString, "asn");
449     arg_desc->SetConstraint("outfmt",
450                             &(*new CArgAllow_Strings, "asn", "asnb", "xml"));
451 
452     arg_desc->AddOptionalKey("mapfile", "MapFile",
453                              "IdMapper config filename",
454                              CArgDescriptions::eInputFile);
455     arg_desc->AddDefaultKey("genome", "Genome",
456                             "UCSC build number",
457                             CArgDescriptions::eString, "");
458 
459     arg_desc->AddFlag("as-graph",
460                       "Generate Seq-graph");
461     arg_desc->AddFlag("single-annot",
462                       "Put all Seq-graphs in a single Seq-annot");
463     arg_desc->AddFlag("as-byte",
464                       "Convert data in byte range");
465     arg_desc->AddFlag("omit-zeros",
466                       "Omit zero values");
467     arg_desc->AddFlag("join-same",
468                       "Join equal sequential values");
469     arg_desc->AddFlag("keep-integer",
470                       "Keep integer as is if they fit in an output range");
471     arg_desc->AddDefaultKey("gap-value", "GapValue",
472                             "Assumed value in gaps",
473                             CArgDescriptions::eDouble, "0");
474 
475     arg_desc->AddOptionalKey("name", "Name",
476                              "Track name or graph title",
477                              CArgDescriptions::eString);
478 
479     // Setup arg.descriptions for this application
480     SetupArgDescriptions(arg_desc.release());
481 }
482 
483 
s_GetFormat(const string & name)484 static ESerialDataFormat s_GetFormat(const string& name)
485 {
486     if (name == "asn") {
487         return eSerial_AsnText;
488     } else if (name == "asnb") {
489         return eSerial_AsnBinary;
490     } else if (name == "xml") {
491         return eSerial_Xml;
492     } else {
493         // Should be caught by argument processing, but as an illustration...
494         THROW1_TRACE(runtime_error, "Bad serial format name " + name);
495     }
496 }
497 
498 
EstimateSize(size_t rows,bool fixed_span) const499 double CWig2tableApplication::EstimateSize(size_t rows, bool fixed_span) const
500 {
501     double ret = 0;
502     ret += rows*4;
503     if ( !fixed_span )
504         ret += rows*4;
505     if ( m_AsByte )
506         ret += rows;
507     else
508         ret += 8*rows;
509     return ret;
510 }
511 
512 
x_PreprocessValues(void)513 CWig2tableApplication::SStat CWig2tableApplication::x_PreprocessValues(void)
514 {
515     SStat stat;
516     bool sorted = true;
517     size_t size = m_Values.size();
518     if ( size ) {
519         stat.SetFirstSpan(m_Values[0].m_Span);
520         stat.SetFirstValue(m_Values[0].m_Value);
521 
522         for ( size_t i = 1; i < size; ++i ) {
523             stat.AddSpan(m_Values[i].m_Span);
524             stat.AddValue(m_Values[i].m_Value);
525             if ( sorted ) {
526                 if ( m_Values[i].m_Pos < m_Values[i-1].m_Pos ) {
527                     sorted = false;
528                 }
529                 if ( m_Values[i].m_Pos != m_Values[i-1].GetEnd() ) {
530                     stat.m_HaveGaps = true;
531                 }
532             }
533         }
534     }
535     if ( !sorted ) {
536         sort(m_Values.begin(), m_Values.end());
537         stat.m_HaveGaps = false;
538         for ( size_t i = 1; i < size; ++i ) {
539             if ( m_Values[i].m_Pos != m_Values[i-1].GetEnd() ) {
540                 stat.m_HaveGaps = true;
541                 break;
542             }
543         }
544     }
545     if ( m_AsGraph && stat.m_HaveGaps ) {
546         stat.AddValue(m_GapValue);
547     }
548 
549     const int range = 255;
550     if ( stat.m_Max > stat.m_Min &&
551          (!m_KeepInteger ||
552           !stat.m_IntValues ||
553           stat.m_Max-stat.m_Min > range) ) {
554         stat.m_Step = (stat.m_Max-stat.m_Min)/range;
555         stat.m_StepMul = 1/stat.m_Step;
556     }
557 
558     if ( !m_AsGraph && m_JoinSame && size ) {
559         TValues nv;
560         nv.reserve(size);
561         nv.push_back(m_Values[0]);
562         for ( size_t i = 1; i < size; ++i ) {
563             if ( m_Values[i].m_Pos == nv.back().GetEnd() &&
564                  m_Values[i].m_Value == nv.back().m_Value ) {
565                 nv.back().m_Span += m_Values[i].m_Span;
566             }
567             else {
568                 nv.push_back(m_Values[i]);
569             }
570         }
571         if ( nv.size() != size ) {
572             double s = EstimateSize(size, stat.m_FixedSpan);
573             double ns = EstimateSize(nv.size(), false);
574             if ( ns < s*.75 ) {
575                 m_Values.swap(nv);
576                 size = m_Values.size();
577                 LOG_POST("Joined size: "<<size);
578                 stat.m_FixedSpan = false;
579             }
580         }
581     }
582 
583     if ( m_AsGraph && !stat.m_FixedSpan ) {
584         stat.m_Span = 1;
585         stat.m_FixedSpan = true;
586     }
587 
588     return stat;
589 }
590 
591 
x_MakeChromId(void)592 CRef<CSeq_id> CWig2tableApplication::x_MakeChromId(void)
593 {
594     CRef<CSeq_id> chrom_id(new CSeq_id(CSeq_id::e_Local, m_ChromId));
595     if ( m_IdMapper ) {
596         m_IdMapper->MapObject(*chrom_id);
597     }
598     return chrom_id;
599 }
600 
601 
x_SetTotalLoc(CSeq_loc & loc,CSeq_id & chrom_id)602 void CWig2tableApplication::x_SetTotalLoc(CSeq_loc& loc, CSeq_id& chrom_id)
603 {
604     if ( m_Values.empty() ) {
605         loc.SetEmpty(chrom_id);
606     }
607     else {
608         CSeq_interval& interval = loc.SetInt();
609         interval.SetId(chrom_id);
610         interval.SetFrom(m_Values.front().m_Pos);
611         interval.SetTo(m_Values.back().GetEnd()-1);
612     }
613 }
614 
615 
MakeTable(void)616 CRef<CSeq_table> CWig2tableApplication::MakeTable(void)
617 {
618     CRef<CSeq_table> table(new CSeq_table);
619 
620     table->SetFeat_type(0);
621 
622     CRef<CSeq_id> chrom_id = x_MakeChromId();
623 
624     CRef<CSeq_loc> table_loc(new CSeq_loc);
625     { // Seq-table location
626         CRef<CSeqTable_column> col_id(new CSeqTable_column);
627         table->SetColumns().push_back(col_id);
628         col_id->SetHeader().SetField_name("Seq-table location");
629         col_id->SetDefault().SetLoc(*table_loc);
630     }
631 
632     { // Seq-id
633         CRef<CSeqTable_column> col_id(new CSeqTable_column);
634         table->SetColumns().push_back(col_id);
635         col_id->SetHeader().SetField_id(CSeqTable_column_info::eField_id_location_id);
636         col_id->SetDefault().SetId(*chrom_id);
637     }
638 
639     // position
640     CRef<CSeqTable_column> col_pos(new CSeqTable_column);
641     table->SetColumns().push_back(col_pos);
642     col_pos->SetHeader().SetField_id(CSeqTable_column_info::eField_id_location_from);
643     CSeqTable_multi_data::TInt& pos = col_pos->SetData().SetInt();
644 
645     SStat stat = x_PreprocessValues();
646 
647     x_SetTotalLoc(*table_loc, *chrom_id);
648 
649     size_t size = m_Values.size();
650     table->SetNum_rows(size);
651     pos.reserve(size);
652 
653     CSeqTable_multi_data::TInt* span_ptr = 0;
654     { // span
655         CRef<CSeqTable_column> col_span(new CSeqTable_column);
656         table->SetColumns().push_back(col_span);
657         col_span->SetHeader().SetField_name("span");
658         if ( stat.m_FixedSpan ) {
659             col_span->SetDefault().SetInt(stat.m_Span);
660         }
661         else {
662             span_ptr = &col_span->SetData().SetInt();
663             span_ptr->reserve(size);
664         }
665     }
666 
667     if ( stat.m_HaveGaps ) {
668         CRef<CSeqTable_column> col_step(new CSeqTable_column);
669         table->SetColumns().push_back(col_step);
670         col_step->SetHeader().SetField_name("value_gap");
671         col_step->SetDefault().SetReal(m_GapValue);
672     }
673 
674     if ( m_AsByte ) { // values
675         CRef<CSeqTable_column> col_min(new CSeqTable_column);
676         table->SetColumns().push_back(col_min);
677         col_min->SetHeader().SetField_name("value_min");
678         col_min->SetDefault().SetReal(stat.m_Min);
679 
680         CRef<CSeqTable_column> col_step(new CSeqTable_column);
681         table->SetColumns().push_back(col_step);
682         col_step->SetHeader().SetField_name("value_step");
683         col_step->SetDefault().SetReal(stat.m_Step);
684 
685         CRef<CSeqTable_column> col_val(new CSeqTable_column);
686         table->SetColumns().push_back(col_val);
687         col_val->SetHeader().SetField_name("values");
688 
689         if ( 1 ) {
690             AutoPtr< vector<char> > values(new vector<char>());
691             values->reserve(size);
692             ITERATE ( TValues, it, m_Values ) {
693                 pos.push_back(it->m_Pos);
694                 if ( span_ptr ) {
695                     span_ptr->push_back(it->m_Span);
696                 }
697                 values->push_back(stat.AsByte(it->m_Value));
698             }
699             col_val->SetData().SetBytes().push_back(values.release());
700         }
701         else {
702             CSeqTable_multi_data::TInt& values = col_val->SetData().SetInt();
703             values.reserve(size);
704 
705             ITERATE ( TValues, it, m_Values ) {
706                 pos.push_back(it->m_Pos);
707                 if ( span_ptr ) {
708                     span_ptr->push_back(it->m_Span);
709                 }
710                 values.push_back(stat.AsByte(it->m_Value));
711             }
712         }
713     }
714     else {
715         CRef<CSeqTable_column> col_val(new CSeqTable_column);
716         table->SetColumns().push_back(col_val);
717         col_val->SetHeader().SetField_name("values");
718         CSeqTable_multi_data::TReal& values = col_val->SetData().SetReal();
719         values.reserve(size);
720 
721         ITERATE ( TValues, it, m_Values ) {
722             pos.push_back(it->m_Pos);
723             if ( span_ptr ) {
724                 span_ptr->push_back(it->m_Span);
725             }
726             values.push_back(it->m_Value);
727         }
728     }
729     return table;
730 }
731 
732 
MakeGraph(void)733 CRef<CSeq_graph> CWig2tableApplication::MakeGraph(void)
734 {
735     CRef<CSeq_graph> graph(new CSeq_graph);
736 
737     CRef<CSeq_id> chrom_id = x_MakeChromId();
738 
739     CRef<CSeq_loc> graph_loc(new CSeq_loc);
740     graph->SetLoc(*graph_loc);
741 
742     SStat stat = x_PreprocessValues();
743 
744     x_SetTotalLoc(*graph_loc, *chrom_id);
745 
746     if ( !m_TrackName.empty() ) {
747         graph->SetTitle(m_TrackName);
748     }
749     graph->SetComp(stat.m_Span);
750     graph->SetA(stat.m_Step);
751     graph->SetB(stat.m_Min);
752 
753     CByte_graph& b_graph = graph->SetGraph().SetByte();
754     b_graph.SetMin(stat.AsByte(stat.m_Min));
755     b_graph.SetMax(stat.AsByte(stat.m_Max));
756     b_graph.SetAxis(0);
757     vector<char>& bytes = b_graph.SetValues();
758 
759     if ( m_Values.empty() ) {
760         graph->SetNumval(0);
761     }
762     else {
763         _ASSERT(stat.m_FixedSpan);
764         TSeqPos start = m_Values[0].m_Pos;
765         TSeqPos end = m_Values.back().GetEnd();
766         size_t size = (end-start)/stat.m_Span;
767         graph->SetNumval(size);
768         bytes.resize(size, stat.AsByte(m_GapValue));
769         ITERATE ( TValues, it, m_Values ) {
770             TSeqPos pos = it->m_Pos - start;
771             TSeqPos span = it->m_Span;
772             _ASSERT(pos % stat.m_Span == 0);
773             _ASSERT(span % stat.m_Span == 0);
774             size_t i = pos / stat.m_Span;
775             int v = stat.AsByte(it->m_Value);
776             for ( ; span > 0; span -= stat.m_Span, ++i ) {
777                 bytes[i] = v;
778             }
779         }
780     }
781     return graph;
782 }
783 
784 
MakeAnnot(void)785 CRef<CSeq_annot> CWig2tableApplication::MakeAnnot(void)
786 {
787     CRef<CSeq_annot> annot(new CSeq_annot);
788     if ( !m_TrackDescription.empty() ) {
789         CRef<CAnnotdesc> desc(new CAnnotdesc);
790         desc->SetTitle(m_TrackDescription);
791         annot->SetDesc().Set().push_back(desc);
792     }
793     if ( !m_TrackName.empty() ) {
794         CRef<CAnnotdesc> desc(new CAnnotdesc);
795         desc->SetName(m_TrackName);
796         annot->SetDesc().Set().push_back(desc);
797     }
798     if ( true ) {
799         m_TrackParams["track type"] = "graph";
800     }
801     if ( !m_TrackParams.empty() ) {
802         CRef<CAnnotdesc> desc(new CAnnotdesc);
803         annot->SetDesc().Set().push_back(desc);
804         CUser_object& user = desc->SetUser();
805         user.SetType().SetStr("Track Data");
806         ITERATE ( TTrackParams, it, m_TrackParams ) {
807             CRef<CUser_field> field(new CUser_field);
808             field->SetLabel().SetStr(it->first);
809             field->SetData().SetStr(it->second);
810             user.SetData().push_back(field);
811         }
812     }
813     return annot;
814 }
815 
816 
MakeTableAnnot(void)817 CRef<CSeq_annot> CWig2tableApplication::MakeTableAnnot(void)
818 {
819     CRef<CSeq_annot> annot = MakeAnnot();
820     annot->SetData().SetSeq_table(*MakeTable());
821     return annot;
822 }
823 
824 
MakeGraphAnnot(void)825 CRef<CSeq_annot> CWig2tableApplication::MakeGraphAnnot(void)
826 {
827     CRef<CSeq_annot> annot = MakeAnnot();
828     annot->SetData().SetGraph().push_back(MakeGraph());
829     return annot;
830 }
831 
832 
ResetChromValues(void)833 void CWig2tableApplication::ResetChromValues(void)
834 {
835     m_ChromId.clear();
836     m_Values.clear();
837 }
838 
839 
x_Error(const char * msg)840 void CWig2tableApplication::x_Error(const char* msg)
841 {
842     ERR_FATAL(GetArgs()["input"].AsString()<<":"<<m_Input->GetLineNumber()<<
843               ": "<<msg<<": \""<<m_CurLine<<"\"");
844 }
845 
846 
x_SkipWS(void)847 bool CWig2tableApplication::x_SkipWS(void)
848 {
849     const char* ptr = m_CurLine.data();
850     size_t skip = 0;
851     for ( size_t len = m_CurLine.size(); skip < len; ++skip ) {
852         char c = ptr[skip];
853         if ( c != ' ' && c != '\t' ) {
854             break;
855         }
856     }
857     m_CurLine = m_CurLine.substr(skip);
858     _ASSERT(m_CurLine.HasZeroAtEnd());
859     return !m_CurLine.empty();
860 }
861 
862 
x_CommentLine(void) const863 inline bool CWig2tableApplication::x_CommentLine(void) const
864 {
865     char c = m_CurLine.data()[0];
866     return c == '#';// || c == '\0';
867 }
868 
869 
x_GetWord(void)870 CTempString CWig2tableApplication::x_GetWord(void)
871 {
872     const char* ptr = m_CurLine.data();
873     size_t skip = 0;
874     for ( size_t len = m_CurLine.size(); skip < len; ++skip ) {
875         char c = ptr[skip];
876         if ( c == ' ' || c == '\t' ) {
877             break;
878         }
879     }
880     if ( skip == 0 ) {
881         x_Error("Identifier expected");
882     }
883     m_CurLine = m_CurLine.substr(skip);
884     _ASSERT(m_CurLine.HasZeroAtEnd());
885     return CTempString(ptr, skip);
886 }
887 
888 
x_GetParamName(void)889 CTempString CWig2tableApplication::x_GetParamName(void)
890 {
891     const char* ptr = m_CurLine.data();
892     size_t skip = 0;
893     for ( size_t len = m_CurLine.size(); skip < len; ++skip ) {
894         char c = ptr[skip];
895         if ( c == '=' ) {
896             m_CurLine = m_CurLine.substr(skip+1);
897             _ASSERT(m_CurLine.HasZeroAtEnd());
898             return CTempString(ptr, skip);
899         }
900         if ( c == ' ' || c == '\t' ) {
901             break;
902         }
903     }
904     x_Error("'=' expected");
905     return CTempString();
906 }
907 
908 
x_GetParamValue(void)909 CTempString CWig2tableApplication::x_GetParamValue(void)
910 {
911     const char* ptr = m_CurLine.data();
912     size_t len = m_CurLine.size();
913     if ( len && *ptr == '"' ) {
914         size_t pos = 1;
915         for ( ; pos < len; ++pos ) {
916             char c = ptr[pos];
917             if ( c == '"' ) {
918                 m_CurLine = m_CurLine.substr(pos+1);
919                 _ASSERT(m_CurLine.HasZeroAtEnd());
920                 return CTempString(ptr+1, pos-1);
921             }
922         }
923         x_Error("Open quotes");
924     }
925     return x_GetWord();
926 }
927 
928 
x_GetPos(TSeqPos & v)929 void CWig2tableApplication::x_GetPos(TSeqPos& v)
930 {
931     TSeqPos ret = 0;
932     const char* ptr = m_CurLine.data();
933     for ( size_t skip = 0; ; ++skip ) {
934         char c = ptr[skip];
935         if ( c >= '0' && c <= '9' ) {
936             ret = ret*10 + (c-'0');
937         }
938         else if ( (c == ' ' || c == '\t' || c == '\0') && skip ) {
939             m_CurLine = m_CurLine.substr(skip);
940             _ASSERT(m_CurLine.HasZeroAtEnd());
941             v = ret;
942             return;
943         }
944         else {
945             x_Error("Integer value expected");
946         }
947     }
948 }
949 
950 
x_TryGetDoubleSimple(double & v)951 bool CWig2tableApplication::x_TryGetDoubleSimple(double& v)
952 {
953     double ret = 0;
954     const char* ptr = m_CurLine.data();
955     size_t skip = 0;
956     bool negate = false, digits = false;
957     for ( ; ; ++skip ) {
958         char c = ptr[skip];
959         if ( !skip ) {
960             if ( c == '-' ) {
961                 negate = true;
962                 continue;
963             }
964             if ( c == '+' ) {
965                 continue;
966             }
967         }
968         if ( c >= '0' && c <= '9' ) {
969             digits = true;
970             ret = ret*10 + (c-'0');
971         }
972         else if ( c == '.' ) {
973             ++skip;
974             break;
975         }
976         else if ( c == '\0' ) {
977             if ( !digits ) {
978                 return false;
979             }
980             m_CurLine.clear();
981             _ASSERT(m_CurLine.HasZeroAtEnd());
982             if ( negate ) {
983                 ret = -ret;
984             }
985             v = ret;
986             return true;
987         }
988         else {
989             return false;
990         }
991     }
992     double digit_mul = 1;
993     for ( ; ; ++skip ) {
994         char c = ptr[skip];
995         if ( c >= '0' && c <= '9' ) {
996             digits = true;
997             digit_mul *= .1;
998             ret += (c-'0')*digit_mul;
999         }
1000         else if ( (c == ' ' || c == '\t' || c == '\0') && digits ) {
1001             m_CurLine.clear();
1002             _ASSERT(m_CurLine.HasZeroAtEnd());
1003             v = ret;
1004             if ( negate ) {
1005                 ret = -ret;
1006             }
1007             return true;
1008         }
1009         else {
1010             return false;
1011         }
1012     }
1013 }
1014 
1015 
x_TryGetDouble(double & v)1016 bool CWig2tableApplication::x_TryGetDouble(double& v)
1017 {
1018     if ( x_TryGetDoubleSimple(v) ) {
1019         return true;
1020     }
1021     const char* ptr = m_CurLine.data();
1022     char* endptr = 0;
1023     v = strtod(ptr, &endptr);
1024     if ( endptr == ptr ) {
1025         return false;
1026     }
1027     if ( *endptr ) {
1028         x_Error("extra text in line");
1029     }
1030     m_CurLine.clear();
1031     _ASSERT(m_CurLine.HasZeroAtEnd());
1032     return true;
1033 }
1034 
1035 
x_TryGetPos(TSeqPos & v)1036 inline bool CWig2tableApplication::x_TryGetPos(TSeqPos& v)
1037 {
1038     char c = m_CurLine.data()[0];
1039     if ( c < '0' || c > '9' ) {
1040         return false;
1041     }
1042     x_GetPos(v);
1043     return true;
1044 }
1045 
1046 
x_GetDouble(double & v)1047 inline void CWig2tableApplication::x_GetDouble(double& v)
1048 {
1049     if ( !x_TryGetDouble(v) ) {
1050         x_Error("Floating point value expected");
1051     }
1052 }
1053 
1054 
x_GetLine()1055 bool CWig2tableApplication::x_GetLine()
1056 {
1057     if ( m_Input->AtEOF() ) {
1058         return false;
1059     }
1060     m_CurLine = *++*m_Input;
1061     _ASSERT(m_CurLine.HasZeroAtEnd());
1062     return true;
1063 }
1064 
1065 
x_UngetLine(void)1066 inline void CWig2tableApplication::x_UngetLine(void)
1067 {
1068     m_Input->UngetLine();
1069 }
1070 
1071 
DumpAnnot(void)1072 void CWig2tableApplication::DumpAnnot(void)
1073 {
1074     if ( !m_Annot ) {
1075         return;
1076     }
1077     if ( m_AsGraph ) {
1078         LOG_POST("DumpAnnot");
1079     }
1080     *m_Output << *m_Annot;
1081     m_Annot = null;
1082 }
1083 
1084 
DumpChromValues(void)1085 void CWig2tableApplication::DumpChromValues(void)
1086 {
1087     if ( m_ChromId.empty() ) {
1088         return;
1089     }
1090     LOG_POST("Chrom: "<<m_ChromId<<" "<<m_Values.size());
1091     if ( !m_Annot ) {
1092         m_Annot = MakeAnnot();
1093     }
1094     if ( m_AsGraph ) {
1095         m_Annot->SetData().SetGraph().push_back(MakeGraph());
1096     }
1097     else {
1098         m_Annot->SetData().SetSeq_table(*MakeTable());
1099     }
1100     if ( !m_SingleAnnot ) {
1101         DumpAnnot();
1102     }
1103     ResetChromValues();
1104 }
1105 
1106 
SetChrom(CTempString chrom)1107 void CWig2tableApplication::SetChrom(CTempString chrom)
1108 {
1109     if ( chrom != m_ChromId ) {
1110         DumpChromValues();
1111         m_ChromId = chrom;
1112     }
1113 }
1114 
1115 
ReadBrowser(void)1116 void CWig2tableApplication::ReadBrowser(void)
1117 {
1118 }
1119 
1120 
ReadTrack(void)1121 void CWig2tableApplication::ReadTrack(void)
1122 {
1123     DumpAnnot();
1124 
1125     m_TrackDescription.clear();
1126     m_TrackTypeValue.clear();
1127     m_TrackType = eTrackType_invalid;
1128     m_TrackParams.clear();
1129     while ( x_SkipWS() ) {
1130         CTempString name = x_GetParamName();
1131         CTempString value = x_GetParamValue();
1132         if ( name == "type" ) {
1133             m_TrackTypeValue = value;
1134             if ( value == "wiggle_0" ) {
1135                 m_TrackType = eTrackType_wiggle_0;
1136             }
1137             else if ( value == "bedGraph" ) {
1138                 m_TrackType = eTrackType_bedGraph;
1139             }
1140             else {
1141                 x_Error("Invalid track type");
1142             }
1143         }
1144         else if ( name == "name" ) {
1145             m_TrackName = value;
1146         }
1147         else if ( name == "description" ) {
1148             m_TrackDescription = value;
1149         }
1150         else {
1151             m_TrackParams[name] = value;
1152         }
1153     }
1154     if ( m_TrackType == eTrackType_invalid ) {
1155         x_Error("Unknown track type");
1156     }
1157 }
1158 
1159 
ReadFixedStep(void)1160 void CWig2tableApplication::ReadFixedStep(void)
1161 {
1162     if ( m_TrackType != eTrackType_wiggle_0 &&
1163          m_TrackType != eTrackType_invalid ) {
1164         x_Error("Track type=wiggle_0 is required");
1165     }
1166 
1167     size_t start = 0;
1168     size_t step = 0;
1169     size_t span = 1;
1170     while ( x_SkipWS() ) {
1171         CTempString name = x_GetParamName();
1172         CTempString value = x_GetParamValue();
1173         if ( name == "chrom" ) {
1174             SetChrom(value);
1175         }
1176         else if ( name == "start" ) {
1177             start = NStr::StringToUInt(value);
1178         }
1179         else if ( name == "step" ) {
1180             step = NStr::StringToUInt(value);
1181         }
1182         else if ( name == "span" ) {
1183             span = NStr::StringToUInt(value);
1184         }
1185         else {
1186             x_Error("Bad param name");
1187         }
1188     }
1189     if ( m_ChromId.empty() ) {
1190         x_Error("No chrom");
1191     }
1192     if ( start == 0 ) {
1193         x_Error("No start");
1194     }
1195     if ( step == 0 ) {
1196         x_Error("No step");
1197     }
1198 
1199     SValueInfo value;
1200     value.m_Pos = start-1;
1201     value.m_Span = span;
1202     while ( x_GetLine() ) {
1203         if ( !x_TryGetDouble(value.m_Value) ) {
1204             x_UngetLine();
1205             break;
1206         }
1207         AddValue(value);
1208         value.m_Pos += step;
1209     }
1210 }
1211 
1212 
ReadVariableStep(void)1213 void CWig2tableApplication::ReadVariableStep(void)
1214 {
1215     if ( m_TrackType != eTrackType_wiggle_0 &&
1216          m_TrackType != eTrackType_invalid ) {
1217         x_Error("Track type=wiggle_0 is required");
1218     }
1219 
1220     size_t span = 1;
1221     while ( x_SkipWS() ) {
1222         CTempString name = x_GetParamName();
1223         CTempString value = x_GetParamValue();
1224         if ( name == "chrom" ) {
1225             SetChrom(value);
1226         }
1227         else if ( name == "span" ) {
1228             span = NStr::StringToUInt(value);
1229         }
1230         else {
1231             x_Error("Bad param name");
1232         }
1233     }
1234     if ( m_ChromId.empty() ) {
1235         x_Error("No chrom");
1236     }
1237     SValueInfo value;
1238     value.m_Span = span;
1239     while ( x_GetLine() ) {
1240         if ( !x_TryGetPos(value.m_Pos) ) {
1241             x_UngetLine();
1242             break;
1243         }
1244         x_SkipWS();
1245         x_GetDouble(value.m_Value);
1246         value.m_Pos -= 1;
1247         AddValue(value);
1248     }
1249 }
1250 
1251 
ReadBedLine(CTempString chrom)1252 void CWig2tableApplication::ReadBedLine(CTempString chrom)
1253 {
1254     if ( m_TrackType != eTrackType_bedGraph &&
1255          m_TrackType != eTrackType_invalid ) {
1256         x_Error("Track type=bedGraph is required");
1257     }
1258     SetChrom(chrom);
1259     SValueInfo value;
1260     x_SkipWS();
1261     x_GetPos(value.m_Pos);
1262     x_SkipWS();
1263     x_GetPos(value.m_Span);
1264     x_SkipWS();
1265     x_GetDouble(value.m_Value);
1266     value.m_Span -= value.m_Pos;
1267     AddValue(value);
1268 }
1269 
1270 
Run(void)1271 int CWig2tableApplication::Run(void)
1272 {
1273     SetDiagPostLevel(eDiag_Warning);
1274     // Get arguments
1275     const CArgs& args = GetArgs();
1276 
1277     if ( args["mapfile"] ) {
1278         m_IdMapper.reset(new CIdMapperConfig(args["mapfile"].AsInputFile(),
1279                                              args["genome"].AsString(),
1280                                              false));
1281     }
1282     else if ( !args["genome"].AsString().empty() ) {
1283         LOG_POST("Genome: "<<args["genome"].AsString());
1284         m_IdMapper.reset(new CIdMapperBuiltin(args["genome"].AsString(),
1285                                               false));
1286     }
1287 
1288     m_AsGraph = args["as-graph"];
1289     m_SingleAnnot = args["single-annot"];
1290     m_OmitZeros = args["omit-zeros"];
1291     m_JoinSame = args["join-same"];
1292     m_AsByte = m_AsGraph || args["as-byte"];
1293     m_KeepInteger = args["keep-integer"];
1294     m_GapValue = args["gap-value"].AsDouble();
1295     if ( m_AsGraph ) {
1296         if ( m_OmitZeros ) {
1297             ERR_POST(Warning<<
1298                      "The option -omit_zeros is ignored for Seq-graph");
1299             m_OmitZeros = false;
1300         }
1301         if ( m_JoinSame ) {
1302             ERR_POST(Warning<<
1303                      "The option -join_same is ignored for Seq-graph");
1304             m_JoinSame = false;
1305         }
1306     }
1307     else {
1308         if ( m_SingleAnnot ) {
1309             ERR_POST(Warning<<
1310                      "The option -single-annot is ignored for Seq-table");
1311             m_SingleAnnot = false;
1312         }
1313     }
1314 
1315     // Read the entry
1316     m_Input.reset(new CWigBufferedLineReader(args["input"].AsString()));
1317     m_Output.reset
1318         (CObjectOStream::Open(s_GetFormat(args["outfmt"].AsString()),
1319                               args["output"].AsOutputFile()));
1320 
1321     if ( args["name"] ) {
1322         m_TrackName = args["name"].AsString();
1323     }
1324     else if ( m_TrackName.empty() ) {
1325         m_TrackName = "User Track";
1326     }
1327 
1328     while ( x_GetLine() ) {
1329         if ( x_CommentLine() ) {
1330             continue;
1331         }
1332         CTempString s = x_GetWord();
1333         if ( s == "browser" ) {
1334             ReadBrowser();
1335         }
1336         else if ( s == "track" ) {
1337             ReadTrack();
1338         }
1339         else if ( s == "fixedStep" ) {
1340             ReadFixedStep();
1341         }
1342         else if ( s == "variableStep" ) {
1343             ReadVariableStep();
1344         }
1345         else {
1346             ReadBedLine(s);
1347         }
1348     }
1349     m_Input.reset();
1350 
1351     DumpChromValues();
1352     DumpAnnot();
1353     m_Output.reset();
1354 
1355     // Exit successfully
1356     return 0;
1357 }
1358 
1359 
1360 /////////////////////////////////////////////////////////////////////////////
1361 //  MAIN
1362 
1363 
main(int argc,const char * argv[])1364 int main(int argc, const char* argv[])
1365 {
1366     // Execute main application function
1367     return CWig2tableApplication().AppMain(argc, argv);
1368 }
1369